if (!is.na(fig_dir)) system(sprintf("mkdir -p %s", fig_dir))
plot_collision <- function(df, threshold=0.95) {
xmax <- max(df$HUMAN_READS * 1.1)
ymax <- max(df$MOUSE_READS * 1.1)
df %>%
mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
MOUSE_PERCENT >= threshold ~ "Mouse",
TRUE ~ "Collision")) %>%
mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse"))) %>%
ggplot(aes(HUMAN_READS, MOUSE_READS, color=CELL)) +
geom_point(alpha=0.3) +
xlab("# of human unique insertions") +
ylab("# of mouse unique insertions") +
scale_color_manual(values=c("Human"="orangered", "Mouse"="dodgerblue", "Collision"="grey80")) +
scale_x_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
coord_cartesian(xlim=c(0, xmax), ylim=c(0, ymax)) +
theme_bw() +
theme(legend.title=element_blank(),
legend.justification=c(1, 1),
legend.position=c(0.95, 0.95),
legend.background=element_rect(colour="black", size=0.5),
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
}
df <- read_delim(slfile("lib1-6forCollision.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")
# df %>%
# mutate(CELL=case_when(HUMAN_PERCENT >= 0.9 ~ "Human",
# MOUSE_PERCENT >= 0.9 ~ "Mouse",
# TRUE ~ "Collision")) %>%
# mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse"))) %>%
# count(CELL)
p <- plot_collision(df)
p
if (!is.na(fig_dir)) ggsave("fig1C.pdf", p, "pdf", fig_dir, height=2.5, width=3)
# Summary stats used in text
threshold <- 0.95
df %>%
mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
MOUSE_PERCENT >= threshold ~ "Mouse",
TRUE ~ "Collision")) %>%
count(CELL)
## # A tibble: 3 x 2
## CELL n
## <chr> <int>
## 1 Collision 48
## 2 Human 719
## 3 Mouse 315
df %>%
mutate(CELL=case_when(HUMAN_PERCENT >= threshold ~ "Human",
MOUSE_PERCENT >= threshold ~ "Mouse",
TRUE ~ "Collision")) %>%
group_by(CELL) %>%
summarise(HUMAN_PERCENT=median(HUMAN_PERCENT), MOUSE_PERCENT=median(MOUSE_PERCENT))
## # A tibble: 3 x 3
## CELL HUMAN_PERCENT MOUSE_PERCENT
## <chr> <dbl> <dbl>
## 1 Collision 0.654 0.346
## 2 Human 0.998 0.00243
## 3 Mouse 0.0129 0.987
human_mouse_boxplot <- function(human_df, mouse_df) {
bind_rows(human_df %>% mutate(CELL="Human"),
mouse_df %>% mutate(CELL="Mouse")) %>%
filter(TN5_TYPE == "Concentrated") %>%
ggplot(aes(CELL, N_UNIQ_READS, fill=CELL)) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
scale_fill_manual(values=c("Human"="orangered", "Mouse"="dodgerblue")) +
xlab("") +
ylab("# of unique insertions") +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
}
unique_reads_boxplot <- function(df) {
df %>%
ggplot(aes(TN5_TYPE, N_UNIQ_READS, fill=TN5_TYPE)) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
scale_fill_manual(values=c("Concentrated"="orangered", "Diluted"="dodgerblue")) +
xlab("Transposon concentration") +
ylab("# of human unique insertions") +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
}
yi129_h_m_df <- read_delim(slfile("yi129_single_cell_summary_all.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")
yi129_human_df <- read_delim(slfile("yi129_single_cell_summary_bed.txt"), col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
mutate(N_UNIQ_READS=y * 2) %>%
filter(CELL_ID %in% yi129_h_m_df$CELL_ID[yi129_h_m_df$HUMAN_PERCENT >= 0.9]) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))
yi129_mouse_df <- read_delim(slfile("yi129_single_cell_summary_mouse_bed.txt"), col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
mutate(N_UNIQ_READS=y * 2) %>% # because paired reads and this is only R1
filter(CELL_ID %in% yi129_h_m_df$CELL_ID[yi129_h_m_df$MOUSE_PERCENT >= 0.9]) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))
p <- human_mouse_boxplot(yi129_human_df, yi129_mouse_df)
p
if (!is.na(fig_dir)) ggsave("fig1D.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
yi128_h_m_df <- read_delim(slfile("yi128_single_cell_summary_all.txt"),
col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"),
delim=" ")
yi128_human_df <- read_delim(slfile("yi128_single_cell_summary_bed.txt"),
col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
mutate(N_UNIQ_READS=y * 2) %>%
filter(CELL_ID %in% yi128_h_m_df$CELL_ID[yi128_h_m_df$HUMAN_PERCENT >= 0.9]) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))
yi128_mouse_df <- read_delim(slfile("yi128_single_cell_summary_mouse_bed.txt"),
col_names=c("CELL_ID", "x", "y", "z"), delim=" ") %>%
mutate(N_UNIQ_READS=y * 2) %>%
filter(CELL_ID %in% yi128_h_m_df$CELL_ID[yi128_h_m_df$MOUSE_PERCENT >= 0.9]) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("CATCAAGG", "TCCTTGTG") ~ "Diluted",
TN5 %in% c("CCGCGCTT", "GTTACGCG", "TCTTAGTG") ~ "Concentrated"))
p <- human_mouse_boxplot(yi128_human_df, yi128_mouse_df)
p
if (!is.na(fig_dir)) ggsave("figS1A1.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
p <- unique_reads_boxplot(yi128_human_df)
p
if (!is.na(fig_dir)) ggsave("figS1A2.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
p <- unique_reads_boxplot(yi129_human_df)
p
if (!is.na(fig_dir)) ggsave("figS1B.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
df_140 <- read_delim(slfile("yi140_GAACCG.single_cells_all.txt"), col_names=FALSE, delim=" ")
df_141 <- read_delim(slfile("yi141_AGGATG.single_cells_all.txt"), col_names=FALSE, delim=" ")
df_144 <- read_delim(slfile("yi144_single_cell_summary_all.txt"), col_names=FALSE, delim=" ")
df_145 <- read_delim(slfile("yi145_single_cell_summary_all.txt"), col_names=FALSE, delim=" ")
df <- bind_rows(
bind_rows(df_140 %>% mutate(LIB="yi140, 141"),
df_141 %>% mutate(LIB="yi140, 141")) %>%
top_n(191, X4),
bind_rows(df_144 %>% mutate(LIB="yi144, 145"),
df_145 %>% mutate(LIB="yi144, 145")) %>%
top_n(2200, X4)
)
p1 <- df %>%
filter(LIB == "yi140, 141") %>%
ggplot(aes(LIB, X4 * 2)) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, fill="orangered") +
xlab("") +
ylab("# of human unique insertions") +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
p2 <- df %>%
filter(LIB == "yi144, 145") %>%
ggplot(aes(LIB, X4 * 2)) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, fill="dodgerblue") +
xlab("") +
ylab("# of human unique insertions") +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
grid.arrange(p1, p2, nrow=1)
if (!is.na(fig_dir)) {
ggsave("figS1C.pdf", p1, "pdf", fig_dir, height=2.5, width=2)
ggsave("figS1D.pdf", p2, "pdf", fig_dir, height=2.5, width=2)
}
yi174_h_df <- read_delim(slfile("yi174.human.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")
cols <- c("CELL_ID", "x", "UNIQ_READS", "DEPTH", "z")
yi174_human_df <- bind_rows(
read_delim(slfile("yi174_ACCACT.single_cells_all.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_AGGATG.single_cells_all.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_CGCTTG.single_cells_all.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_GTCCTA.single_cells_all.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_TTCTCC.single_cells_all.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_TTTCGC.single_cells_all.txt"), col_names=cols, delim=" ")
) %>%
mutate(N_UNIQ_READS=UNIQ_READS * 2) %>%
filter(CELL_ID %in% yi174_h_df$CELL_ID[yi174_h_df$HUMAN_PERCENT >= 0.9]) %>%
tidyr::extract(CELL_ID, c("TN5"), ".*\\.(........).*", remove=FALSE) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("TGATATTG", "GATCCCGT", "CTCGATTA") ~ "Concentrated",
TRUE ~ "Diluted"))
yi174_m_df <- bind_rows(
read_tsv(slfile("yi174_ACCACT.mouse.txt"), col_names=c("CELL_ID")),
read_tsv(slfile("yi174_AGGATG.mouse.txt"), col_names=c("CELL_ID")),
read_tsv(slfile("yi174_CGCTTG.mouse.txt"), col_names=c("CELL_ID")),
read_tsv(slfile("yi174_GTCCTA.mouse.txt"), col_names=c("CELL_ID")),
read_tsv(slfile("yi174_TTCTCC.mouse.txt"), col_names=c("CELL_ID")),
read_tsv(slfile("yi174_TTTCGC.mouse.txt"), col_names=c("CELL_ID"))
)
cols <- c("CELL_ID", "x", "UNIQ_READS", "z")
yi174_mouse_df <- bind_rows(
read_delim(slfile("yi174_ACCACT.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_AGGATG.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_CGCTTG.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_GTCCTA.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_TTCTCC.mouse.single_cells_bed.txt"), col_names=cols, delim=" "),
read_delim(slfile("yi174_TTTCGC.mouse.single_cells_bed.txt"), col_names=cols, delim=" ")
) %>%
mutate(N_UNIQ_READS=UNIQ_READS * 2) %>%
filter(CELL_ID %in% yi174_m_df$CELL_ID) %>%
tidyr::extract(CELL_ID, c("TN5"), ".*\\.(........).*", remove=FALSE) %>%
mutate(TN5_TYPE=case_when(TN5 %in% c("TGATATTG", "GATCCCGT", "CTCGATTA") ~ "Concentrated",
TRUE ~ "Diluted"))
p <- unique_reads_boxplot(yi174_human_df)
p
if (!is.na(fig_dir)) ggsave("figS1E1.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
p <- human_mouse_boxplot(yi174_human_df, yi174_mouse_df)
p
if (!is.na(fig_dir)) ggsave("figS1E2.pdf", p, "pdf", fig_dir, height=2.5, width=2.5)
depth_beds <- list.files(slfile("read_depth"), full.names=TRUE)
names(depth_beds) <- list.files(slfile("read_depth"))
depth_df <- map_df(depth_beds, function(x) {
read_delim(x, col_names=FALSE, delim=" ")
}, .id="LIB")
depth_df <- depth_df %>%
mutate(RUN=gsub("_.*", "", LIB)) %>%
mutate(UNIQ_READS=X4 * 2) %>%
mutate(LIB=gsub("\\.single.*|_merge.*", "", LIB)) %>%
mutate(BC=gsub(".*_", "", LIB)) %>%
group_by(RUN) %>%
mutate(BC_ID=as.numeric(as.factor(BC))) %>%
ungroup() %>%
mutate(LIB_ID=paste(RUN, BC_ID, sep=".")) %>%
mutate(LIB_ID=factor(LIB_ID, levels=unique(.$LIB_ID)))
p <- ggplot(depth_df, aes(LIB_ID, UNIQ_READS, fill=grepl("yi19", RUN))) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5, outlier.size=0.5) +
scale_fill_manual(values=c("orangered", "dodgerblue")) +
xlab("") +
ylab("# of unique insertions") +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4), limits=c(0, 2500000)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle=90, hjust=1))
p
if (!is.na(fig_dir)) ggsave("figS1F.pdf", p, "pdf", fig_dir, height=4, width=7)
Extrapolation through down sampling to infer relationship between number of unique reads and sequencing depth.
celltype_annot <- read.table(slfile("barcode_tn5.txt"), stringsAsFactors=FALSE, header=TRUE)
ds_df1 <- read.table(slfile("downsample_summary.tab"), stringsAsFactors=FALSE) %>%
dplyr::rename(N_READS=V1, FILE=V2) %>%
filter(FILE != "total" & !grepl("\\.TCCTTGTG", FILE)) %>%
mutate(TYPE=case_when(grepl("uniq", FILE) ~ "UNIQ",
TRUE ~ "TOTAL"),
FILE=gsub(".uniq", "", FILE)) %>%
tidyr::extract(FILE, c("CELL_ID", "FRAC"), "(.*).R1.human.Qgt0.(.*).bed", remove=FALSE) %>%
spread(TYPE, N_READS) %>%
mutate(DOWNSAMPLE=as.numeric(FRAC) / 10) %>%
mutate(RATIO=TOTAL / UNIQ) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
left_join(celltype_annot, by="TN5") %>%
mutate(UNIQ = UNIQ * 2) # because these are read pairs
md1 <- lm(UNIQ ~ log10(RATIO), data=ds_df1)
pred1 <- predict(md1, ds_df1)
pred_intv1 <- predict(md1, interval="predict")
pred_df1 <- ds_df1 %>%
mutate(FIT=pred_intv1[, "fit"],
LOWER=pred_intv1[, "lwr"],
UPPER=pred_intv1[, "upr"])
predict(md1, newdata=data.frame("RATIO"=5))
## 1
## 1852515
predict(md1, newdata=data.frame("RATIO"=10))
## 1
## 2624546
ds_df2 <- read.table(slfile("yi140_GAACCG_downsample_summary.tab"), stringsAsFactors=FALSE) %>%
bind_rows(read.table(slfile("yi141_AGGATG_downsample_summary.tab"), stringsAsFactors=FALSE)) %>%
dplyr::rename(N_READS=V1, FILE=V2) %>%
mutate(FILE=gsub("downsample/", "", FILE)) %>%
filter(FILE != "total" & !grepl("uniq[1-9]", FILE)) %>%
mutate(TYPE=case_when(grepl("uniq", FILE) ~ "UNIQ",
TRUE ~ "TOTAL"),
FILE=gsub(".uniq", "", FILE)) %>%
tidyr::extract(FILE, c("CELL_ID", "FRAC"), "(.*).R1.human.Qgt0.(.*).bed", remove=FALSE) %>%
spread(TYPE, N_READS) %>%
mutate(DOWNSAMPLE=as.numeric(FRAC) / 10) %>%
mutate(RATIO=TOTAL / UNIQ) %>%
separate(CELL_ID, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
left_join(celltype_annot, by="TN5") %>%
mutate(UNIQ = UNIQ * 2) # because these are read pairs
md2 <- lm(UNIQ ~ log10(RATIO), data=ds_df2)
pred2 <- predict(md2, ds_df2)
pred_intv2 <- predict(md2, interval="predict")
pred_df2 <- ds_df2 %>%
mutate(FIT=pred_intv2[, "fit"],
LOWER=pred_intv2[, "lwr"],
UPPER=pred_intv2[, "upr"])
predict(md2, newdata=data.frame("RATIO"=1.78))
## 1
## 1521039
predict(md2, newdata=data.frame("RATIO"=5))
## 1
## 4231109
predict(md2, newdata=data.frame("RATIO"=10))
## 1
## 6049887
p <- bind_rows(pred_df1 %>% mutate(CAT="yi129"), pred_df2 %>% mutate(CAT="yi140, 141")) %>%
ggplot(aes(RATIO, UNIQ, color=CAT)) +
geom_point(alpha=0.05) +
geom_line(aes(RATIO, FIT, color=CAT), size=1, alpha=1) +
scale_color_manual(values=c("orangered", "dodgerblue"), name="Library") +
xlab("Depth") +
ylab("# of unique insertions") +
scale_y_continuous(labels=scales::format_format(scientific = FALSE)) +
# geom_ribbon(aes(x=RATIO, ymin=LOWER, ymax=UPPER), alpha=0.3, fill="orangered") +
theme_classic() +
theme(legend.position=c(1, 1), legend.justification=c(1, 1))
p
if (!is.na(fig_dir)) ggsave("figS1G.pdf", p, "pdf", fig_dir, height=2.5, width=3.5)
df <- read_delim(slfile("lib1-6forCollision.txt"), col_names=c("CELL_ID", "HUMAN_READS", "MOUSE_READS", "TOTAL_READS", "HUMAN_PERCENT", "MOUSE_PERCENT"), delim=" ")
df <- df %>%
mutate(CELL=case_when(HUMAN_PERCENT >= 0.95 ~ "Human",
MOUSE_PERCENT >= 0.95 ~ "Mouse",
TRUE ~ "Collision")) %>%
mutate(CELL=factor(CELL, levels=c("Human", "Collision", "Mouse")))
fen<-df %>%
filter(CELL=="Collision")
median(fen$MOUSE_PERCENT)
## [1] 0.3461845
prop_files <- list.files("collision/", ".*prop.txt", full.names=TRUE)
names(prop_files) <- gsub(".chr_prop.txt", "", basename(prop_files))
prop_df <- map_df(prop_files, function(f) {
read_delim(f, col_names=c("CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), delim=" ")
}, .id="CELL_ID") %>%
left_join(df, by="CELL_ID") %>%
mutate(HUMAN_CHROM_READS_FRAC=CHROM_UNIQ_READS / HUMAN_READS)
qt1_files <- list.files("collision/", ".idx", full.names=TRUE)
names(qt1_files) <- gsub(".gt1.idx", "", basename(qt1_files))
qt1_df <- map_df(qt1_files, function(f) {
read_table(f, col_names="CHROM_UNIQ_READS") %>%
mutate(CHROM=c(1:22, "X", "Y")) %>%
mutate(HUMAN_READS=sum(CHROM_UNIQ_READS)) %>%
mutate(HUMAN_CHROM_READS_FRAC=CHROM_UNIQ_READS / HUMAN_READS)
}, .id="CELL_ID")
prop_df <- prop_df %>%
left_join(qt1_df, by=c("CELL_ID", "CHROM"), suffix=c("", "_QT1")) %>%
filter(!CHROM %in% c("X", "Y")) %>%
mutate(CHROM=factor(CHROM, levels=as.character(c(1:22))))
p<-ggplot(prop_df, aes(CHROM, HUMAN_CHROM_READS_FRAC)) +
geom_boxplot(aes(fill=CELL), outlier.shape=NA) +
scale_fill_manual(values=c("Human"="orangered", "Mouse"="dodgerblue", "Collision"="grey80")) +
coord_cartesian(ylim=c(0, 0.25)) +
labs(x="Chromosome", y="Human reads fraction for chromosome") +
theme_bw() +
theme(legend.title=element_blank(),
legend.justification=c(1, 1),
legend.position=c(0.95, 0.95),
legend.background=element_rect(colour="black", size=0.5),
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS1H.pdf", p, "pdf", fig_dir, height=3, width=6)
plot_chr <- function(df, chr_size) {
chr_size <- mutate(chr_size, Chromosome=gsub("chr", "", Chromosome))
chrs <- unique(chr_size$Chromosome)
chr_levels <- chrs[order(as.numeric(chrs))]
df$Chromosome <- factor(df$Chromosome, levels=chr_levels)
chr_size <- chr_size %>%
mutate(Chromosome=factor(Chromosome, levels=chr_levels)) %>%
arrange(Chromosome) %>%
mutate(Size_M=Size/1000000) %>%
mutate(Cumsize=cumsum(Size_M) - Size_M)
# add chromosome size to Start
df_new <- df %>%
arrange(Chromosome, Start) %>%
filter(Chromosome != "M") %>%
mutate(Start_M=Start/1000000) %>%
mutate(Cumstart=0)
for (i in seq_len(nrow(df_new))) {
this_chr <- df_new$Chromosome[i]
this_cumsize <- filter(chr_size, Chromosome == this_chr)$Cumsize
df_new[i, "Cumstart"] <- df_new[i, "Start_M"] + this_cumsize
}
g <- ggplot(df_new, aes(Cumstart, Ratio)) +
geom_vline(xintercept=chr_size$Cumsize, color="#f0f0f0") +
geom_point(aes(color=Chromosome), size=0.2, alpha=0.8) +
scale_color_manual(values=rep(c("orangered", "dodgerblue"), 12)) +
xlim(c(0, NA)) +
ylim(c(0, 5)) +
ylab("Relative chromosome\ncopy number") +
xlab("") +
theme_bw() +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
axis.ticks.x = element_blank(),
axis.text.x = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
return(g)
}
h_freec_df <- read_tsv(slfile("yi129_GGGTGC.GTTACGCGGCAGTAG.bam_ratio.txt"), guess_max=1e7)
h_chr_size <- read_tsv(slfile("hg19.len"), col_names=c("nouse", "Chromosome", "Size"), guess_max=1e7)
m_freec_df <- read_tsv(slfile("yi129_GGGTGC.GTTACGCGCAAAACG.bam_ratio.txt"), guess_max=1e7)
m_chr_size <- read_tsv(slfile("mm10.len"), col_names=c("Chromosome", "Size"), guess_max=1e7)
p <- plot_chr(h_freec_df, h_chr_size)
p
if (!is.na(fig_dir)) ggsave("fig1E1.pdf", p, "pdf", fig_dir, height=2, width=12)
p <- plot_chr(m_freec_df, m_chr_size)
p
if (!is.na(fig_dir)) ggsave("fig1E2.pdf", p, "pdf", fig_dir, height=2, width=12)
aneuploidy_df <- read.table(slfile("lib144_aneuploidy.txt"), col.names=c("CELL", "CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), stringsAsFactors=FALSE) %>%
bind_rows(read.table(slfile("lib145_aneuploidy.txt"), col.names=c("CELL", "CHROM", "CHROM_SIZE", "CHROM_UNIQ_READS", "CHROM_READS_FRAC", "TOTAL_UNIQ_READS", "RATIO"), stringsAsFactors=FALSE)) %>%
mutate(CHROM=factor(CHROM, levels=c(as.character(1:22), "X", "Y"))) %>%
mutate(CELL=gsub("./|.chr_prop.txt", "", CELL))
aneu_df <- aneuploidy_df %>%
separate(CELL, c("LIB", "BC"), sep="_", remove=FALSE) %>%
separate(BC, c("SSS", "ELSE"), sep="\\.") %>%
separate(ELSE, c("TN5", "LIGATION"), sep=8) %>%
left_join(celltype_annot, by="TN5")
p <- aneu_df %>%
# filter(CHROM != "Y") %>%
ggplot(aes(CHROM, RATIO)) +
# geom_jitter(aes(color=RATIO < 0.1), alpha=0.3) +
geom_boxplot(aes(fill=CELL_TYPE), outlier.shape=NA, outlier.stroke=NA, size=0.5) +
scale_fill_manual(values=c("orangered", "dodgerblue")) +
facet_wrap(~CELL_TYPE, nrow=2) +
coord_cartesian(ylim=c(0, 2)) +
xlab("Chromosome") +
ylab("Normalized copy number\nrelative to euploidy") +
theme_classic() +
theme(strip.background=element_blank(),
legend.position='none')
p
if (!is.na(fig_dir)) ggsave("fig1F.pdf", p, "pdf", fig_dir, width=12, height=3)
hb_example <- readRDS(slfile("yi190_ACGCGA.ATCGCGTTGGCTTGG.filtered.rds"))
hb_example_plot <- hb_example$haploidState %>%
mutate(POS=POS / 1e6) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
ggplot(aes(POS, FREQ_ALT)) +
geom_point(alpha=0.05) +
geom_line(aes(POS, ALT_STATE - (ALT_STATE - 0.5) / 5), color="red") +
facet_wrap(~CHROM, nrow=2, scales="free_x") +
xlab("Chromosome coordinates (M)") +
ylab("Allele frequency\n(Spretus)") +
theme_classic() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=2)) +
theme(legend.position='none',
strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
strip.background = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1))
hb_example_plot
if (!is.na(fig_dir)) ggsave("fig4A.pdf", hb_example_plot, "pdf", fig_dir, width=10, height=2.5)
me_example <- readRDS(slfile("yi193_CGCTTG.CATCAAGGCACTGTT.filtered.rds"))
me_example_plot <- me_example$m2DiploidState %>%
mutate(POS=POS / 1e6) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
ggplot(aes(POS, FREQ_ALT)) +
geom_point(alpha=0.15) +
geom_line(aes(POS, ALT_STATE), color="red") +
facet_wrap(~CHROM, nrow=2, scales="free_x") +
xlim(c(3.7, NA)) +
xlab("Chromosome coordinates (M)") +
ylab("Allele frequency\n(Spretus)") +
theme_classic() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=2)) +
theme(legend.position='none',
strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
strip.background = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1))
me_example_plot
if (!is.na(fig_dir)) ggsave("fig4B.pdf", me_example_plot, "pdf", fig_dir, width=10, height=2.5)
mt_example <- readRDS(slfile("yi193_TATTCT.ACCATTTATCGGTTT.filtered.rds"))
mt_example_plot <- mt_example$m2DiploidState %>%
# manually removed three artifacts, reflect it here
filter(!(CHROM == "chr8" & between(POS, 50e6, 100e6) & ALT_STATE == 0.5)) %>%
filter(!(CHROM == "chr16" & between(POS, 50e6, 75e6) & ALT_STATE == 0.5)) %>%
filter(!(CHROM == "chr17" & between(POS, 30e6, 50e6) & ALT_STATE == 0.5)) %>%
mutate(POS=POS / 1e6) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
ggplot(aes(POS, FREQ_ALT)) +
geom_point(alpha=0.15) +
geom_line(aes(POS, ALT_STATE), color="red") +
facet_wrap(~CHROM, nrow=2, scales="free_x") +
xlab("Chromosome coordinates (M)") +
ylab("Allele frequency\n(Spretus)") +
theme_classic() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=2)) +
theme(legend.position='none',
strip.text.x = element_text(margin = margin(0.1, 0, 0.1, 0, "cm")),
strip.background = element_blank(),
axis.text.x = element_text(angle=45, hjust = 1))
mt_example_plot
if (!is.na(fig_dir)) ggsave("fig4C.pdf", mt_example_plot, "pdf", fig_dir, width=10, height=2.5)
m2_diploid <- read_tsv(slfile("m2.final.tab")) # used only here in fig 2
m2_upd <- read_tsv(slfile("m2.upd.tab")) # used only here in fig 2
hb_spret_df <- read_tsv(slfile("haploid.no0.hb.filtered.spret.tab"))
haploid_spret_upd<- read_tsv(slfile("haploid.upd.0312.spret.tab"))
hb_cast_df <- read_tsv(slfile("haploid.no0.hb.filtered.cast.tab"))
m2_cast_df <- read_tsv(slfile("m2diploid.concensus.mb.cast.tab"))
m2_cast_upd<-read_tsv(slfile("cast/m2.upd.cast.tab"))
# count chromosomes per cell that are me or mt in m2 and hp separately
# me cells
me_per_cell_m2 <- m2_diploid %>%
filter(SEGREGATION %in% c("me", "hp")) %>%
bind_rows(filter(m2_upd, STRAIN %in% c("bpNA"))) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
me_per_cell_m2_upd <- m2_upd %>%
filter(STRAIN %in% c("spret", "b6")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
drop0_per_cell <- m2_upd %>%
filter(STRAIN %in% c("drop0", "spret/drop0")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID) %>%
dplyr::rename(n_drop0=n)
me_per_cell_m2_merge <- full_join(me_per_cell_m2, me_per_cell_m2_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
full_join(drop0_per_cell, drop0_per_cell, by="CELL_ID") %>%
replace_na(list(n_m2=0, n_m2_upd=0, n_drop0=0)) %>%
mutate(SUM_m2=n_m2+n_m2_upd+n_drop0)
# mt cells
mt_per_cell_m2 <- m2_diploid %>%
filter(SEGREGATION=="mt") %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
mt_per_cell_m2_upd <- m2_upd %>%
filter(STRAIN %in% c("het")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
mt_per_cell_m2_merge <- full_join(mt_per_cell_m2, mt_per_cell_m2_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
replace_na(list(n_m2=0, n_m2_upd=0)) %>%
mutate(SUM_m2=n_m2+n_m2_upd)
all_m2_merge <- full_join(mt_per_cell_m2_merge, me_per_cell_m2_merge, by="CELL_ID", suffix=c("_mt","_me")) %>%
replace_na(list(n_m2_mt=0, n_m2_me=0, n_m2_upd_mt=0, n_m2_upd_me=0, SUM_m2_mt=0, SUM_m2_me=0)) %>%
mutate(SUM_m2 = SUM_m2_mt+SUM_m2_me) %>%
mutate(rate_m2_me = SUM_m2_me/(SUM_m2_mt+SUM_m2_me)) %>%
mutate(bp_m2 = n_m2_mt + n_m2_me) %>%
mutate(TYPE=case_when(SUM_m2_mt >= 15 ~ "Mitotic cell",
SUM_m2_me >= 15 ~ "Meiotic cell",
TRUE ~ "Others"))
p_obs <- all_m2_merge %>%
rowwise() %>%
mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
mutate(SORT1=n_mt + n_na) %>%
arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me)) %>%
# arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
theme_void() +
ggtitle("Observed")
# simulate null distribution with p = 0.7606
set.seed(12345)
simulate_m2 <- list()
simulate_m2$n_me <- rbinom(length(unique(all_m2_merge$CELL_ID)), 19, 0.7606)
simulate_m2$n_mt <- 19 - simulate_m2$n_me
simulate_df <- as.data.frame(simulate_m2)
simulate_df$CELL_ID <- seq_len(nrow(simulate_df))
p_null_1 <- simulate_df %>%
arrange(desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
theme_void() +
ggtitle("Null: p=0.7606")
# simulate null distribution with p = 0.5
set.seed(12345)
simulate_m2 <- list()
simulate_m2$n_me <- rbinom(length(unique(all_m2_merge$CELL_ID)), 19, 0.5)
simulate_m2$n_mt <- 19 - simulate_m2$n_me
simulate_df <- as.data.frame(simulate_m2)
simulate_df$CELL_ID <- seq_len(nrow(simulate_df))
p_null_2 <- simulate_df %>%
arrange(desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
theme_void() +
ggtitle("Null: p=0.5")
p_obs
p_null_1
p_null_2
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "fig4D_4E.pdf"), width=10, height=4)
print(p_obs)
print(p_null_1)
print(p_null_2)
dev.off()
}
## quartz_off_screen
## 2
all_m2_merge_biased <- all_m2_merge %>%
filter(TYPE %in% c("Mitotic cell","Meiotic cell"))
m2_spret_df <- m2_diploid %>%
filter(CELL_ID %in% all_m2_merge_biased$CELL_ID)
m2_spret_upd <- m2_upd %>%
filter(CELL_ID %in% all_m2_merge_biased$CELL_ID)
dim(m2_spret_df)
## [1] 4184 9
dim(m2_spret_upd)
## [1] 825 3
# compare crossover number between B6/Cast and B6/Spret
dim(hb_cast_df)
## [1] 60755 7
dim(m2_cast_df)
## [1] 2246 9
co_per_cell_hp_cast <- hb_cast_df %>%
group_by(CELL_ID) %>%
summarise(count = n())
co_per_cell_hp_spret <- hb_spret_df %>%
group_by(CELL_ID) %>%
summarise(count = n())
co_per_cell_m2_spret <- m2_spret_df %>%
group_by(CELL_ID) %>%
summarise(count = n())
co_per_cell_m2_cast <- m2_cast_df %>%
group_by(CELL_ID) %>%
summarise(count = n())
wilcox.test(co_per_cell_hp_cast$count, co_per_cell_hp_spret$count)$p.value
## [1] 6.716958e-26
wilcox.test(co_per_cell_m2_cast$count, co_per_cell_m2_spret$count)$p.value
## [1] 1.889733e-10
mean(co_per_cell_hp_cast$count)/19
## [1] 0.5764614
mean(co_per_cell_hp_spret$count)/19
## [1] 0.6203437
mean(co_per_cell_m2_cast$count)/19
## [1] 1.027918
mean(co_per_cell_m2_spret$count)/19
## [1] 0.9175439
remove(m2_spret_df) # will read this file in later but it is the same
remove(m2_spret_upd) # will read this file in later but it is the same
p <- all_m2_merge %>%
rowwise() %>%
mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
mutate(SORT1=n_m2_mt + n_m2_upd_mt) %>%
mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me), n_m2_upd_me) %>%
# arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c(2,3,5,6,7)) %>%
mutate(Category=factor(Category, levels=c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_m2_me"="mistyrose", "n_m2_upd_me"="firebrick1", "n_m2_mt"="darkolivegreen1", "n_m2_upd_mt"="deepskyblue", "n_drop0"="black")) +
theme_void() +
ggtitle("fig 4E")
p
if (!is.na(fig_dir)) ggsave("fig4E.pdf", p, "pdf", fig_dir, width=10, height=4)
B6/Cast, same format as Fig 2
n_mt_cast <- read_tsv(slfile("cast/number_mt.cast.tab"))
n_class1 <- c(n_mt_cast$class1_1, n_mt_cast$class1_2) %>% na.omit() %>% as.vector()
n_class2 <- n_mt_cast$class2 %>% na.omit() %>% as.vector()
n_class3 <- n_mt_cast$class3 %>% na.omit() %>% as.vector()
Fig.4 format, two figures, 1) binomial(19, 0.5); 2) class 2
p_obs <- tibble(CELL_ID=as.character(seq_len(length(n_class2))), n_mt=n_class2) %>%
mutate(n_me=19 - n_mt) %>%
arrange(desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
theme_void() +
ggtitle("Observed: class 2")
# simulate null distribution with p = 0.5
set.seed(12345)
p_null <- tibble(CELL_ID=as.character(seq_len(length(n_class2))), n_mt=rbinom(length(n_class2), 19, 0.5)) %>%
mutate(n_me=19 - n_mt) %>%
arrange(desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1")) +
theme_void() +
ggtitle("Null: p=0.5")
p_null
p_obs
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "fig5A_5B.pdf"), width=10, height=4)
print(p_null)
print(p_obs)
dev.off()
}
## quartz_off_screen
## 2
Fig.4 format, three figures, 1) class1_1 and class1_2 combined, 2) class1_1 only, 3) class1_1 only, break down pink and light green
# The hard way to do 2) class1_1 only since there's a column of mt in n_mt already
# But this is the way to do 3) class1_1 only, break down pink and light green
# count chromosomes per cell that are me or mt in m2 and hp separately
# me cells
me_per_cell_m2_cast <- m2_cast_df %>%
filter(SEGREGATION %in% c("me", "hp")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
me_per_cell_m2_cast_upd <- m2_cast_upd %>%
filter(STRAIN %in% c("cast", "b6")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
drop0_per_cell <- m2_cast_upd %>%
filter(STRAIN %in% c("drop0")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID) %>%
dplyr::rename(n_drop0=n)
me_per_cell_m2_merge_cast <- full_join(me_per_cell_m2_cast, me_per_cell_m2_cast_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
full_join(drop0_per_cell, drop0_per_cell, by="CELL_ID") %>%
replace_na(list(n_m2=0, n_m2_upd=0, n_drop0=0)) %>%
mutate(SUM_m2=n_m2+n_m2_upd+n_drop0)
# mt cells
mt_per_cell_m2_cast <- m2_cast_df %>%
filter(SEGREGATION=="mt") %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
mt_per_cell_m2_cast_upd <- m2_cast_upd %>%
filter(STRAIN %in% c("het")) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID)
mt_per_cell_m2_merge_cast <- full_join(mt_per_cell_m2_cast, mt_per_cell_m2_cast_upd, by="CELL_ID", suffix=c("_m2","_m2_upd")) %>%
replace_na(list(n_m2=0, n_m2_upd=0)) %>%
mutate(SUM_m2=n_m2+n_m2_upd)
all_m2_merge_cast <- full_join(mt_per_cell_m2_merge_cast, me_per_cell_m2_merge_cast, by="CELL_ID", suffix=c("_mt","_me")) %>%
replace_na(list(n_m2_mt=0, n_m2_me=0, n_m2_upd_mt=0, n_m2_upd_me=0, SUM_m2_mt=0, SUM_m2_me=0)) %>%
mutate(SUM_m2=SUM_m2_mt + SUM_m2_me) %>%
mutate(rate_m2_me=SUM_m2_me / (SUM_m2_mt + SUM_m2_me)) %>%
mutate(bp_m2=n_m2_mt + n_m2_me) %>%
mutate(TYPE=case_when(SUM_m2_mt >= 15 ~ "Mitotic cell",
SUM_m2_me >= 15 ~ "Meiotic cell",
TRUE ~ "Others"))
p_3b2 <- all_m2_merge_cast %>%
rowwise() %>%
mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
arrange(n_mt, desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
theme_void()
p_3b3 <- all_m2_merge_cast %>%
rowwise() %>%
mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
mutate(SORT1=n_m2_mt + n_m2_upd_mt) %>%
mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
arrange(n_mt, n_drop0, n_m2_upd_mt, desc(n_me), n_m2_upd_me) %>%
# arrange(SORT1, n_m2_upd_mt, n_m2_mt, n_drop0, n_m2_upd_me, desc(n_m2_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt")) %>%
mutate(Category=factor(Category, levels=c("n_m2_me", "n_m2_upd_me", "n_drop0", "n_m2_mt", "n_m2_upd_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_m2_me"="mistyrose", "n_m2_upd_me"="firebrick1", "n_m2_mt"="darkolivegreen1", "n_m2_upd_mt"="deepskyblue", "n_drop0"="black")) +
theme_void()
# 1) class1_1 and class1_2 combined, blue and red like p_3b2
n_class1_1 <- n_mt_cast$class1_1 %>% na.omit() %>% as.vector()
n_class1_2 <- n_mt_cast$class1_2 %>% na.omit() %>% as.vector()
p_3b1 <- all_m2_merge_cast %>%
rowwise() %>%
mutate(n_na=max(0, 19 - n_m2_mt - n_m2_upd_mt - n_m2_me - n_m2_upd_me - n_drop0)) %>%
mutate(n_mt=n_m2_mt + n_m2_upd_mt, n_me=n_m2_me + n_m2_upd_me) %>%
# combine the class1_2 mt counts by myself from n_mt
bind_rows(
tibble(CELL_ID=as.character(seq_len(length(n_class1_2))), n_mt=n_class1_2) %>%
mutate(n_me=19 - n_mt)) %>%
arrange(n_mt, desc(n_me)) %>%
mutate(CELL_ID=factor(CELL_ID, levels=rev(unique(.$CELL_ID)))) %>%
gather(Category, Count, c("n_me", "n_drop0", "n_mt")) %>%
mutate(Category=factor(Category, levels=c("n_me", "n_drop0", "n_mt"))) %>%
ggplot(aes(CELL_ID, Count, fill=Category)) +
geom_bar(stat='identity', colour=NA, width=0.8) +
scale_fill_manual(values=c("n_mt"="deepskyblue", "n_me"="firebrick1", "n_drop0"="black")) +
theme_void()
p_3b1
p_3b2
p_3b3
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "fig5CDE.pdf"), 10, 4)
print(p_3b1)
print(p_3b2)
print(p_3b3)
dev.off()
}
## quartz_off_screen
## 2
Number of mt versus fitted mixture model, 1) class2; 2) class1; 3) class3
Not run when knitting Rmd due to time.
We fit the data to a mixture of three binomial distributions parameterized by p_1, p_2, p_3, respectively, denoting their probabilities of chromosomes segregating equationally. The relative contribution of these three binomial distributions are denoted by a length 3 vector theta. We estimate p_1, p_2, p_3 as well as theta by drawing samples from their posterior distributions using Stan with uniform Dirichlet prior for theta: theta ~ Dir(K=3, alpha=1), and beta prior for p: p ~ Beta(a=5, b=5).
# try fitting mixture of binomials for number of events
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
set.seed(12345)
dat1 <- list(N=length(n_class1), y=n_class1, n_groups=3)
fit1 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat1,
iter = 1000, chains = 4)
dat2 <- list(N=length(n_class2), y=n_class2, n_groups=3)
fit2 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat2,
iter = 1000, chains = 4)
dat3 <- list(N=length(n_class3), y=n_class3, n_groups=3)
fit3 <- stan(file = system.file("mt_mixture_model.stan", package="sciliantifig"), data = dat3,
iter = 1000, chains = 4)
# save(fit1, fit2, fit3, file="inst/extdata/mt_mixture_model_fits.rda")
print(fit1)
print(fit2)
print(fit3)
Continue and make plots for Fig S4A, S4B, S4C
##
par1 <- rstan::extract(fit1)
thetas <- colMeans(par1$Theta)
sizes <- round(100000 * thetas)
fit1_samples <- map_df(set_names(1:3), function(i) {
tibble("Sample"=rbinom(sizes[i], 19, par1$p[, i]))
}, .id="Group")
p1_fitted <- ggplot(fit1_samples, aes(Sample, fill=Group)) +
geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
xlab("# of equational chromosome segregations") +
ylab("Estimated fraction\n") +
theme_classic() +
# theme(legend.justification = c(1, 1), legend.position = c(1, 1)) +
NULL
p1_obs <- data.frame(n_mt=n_class1) %>%
count(n_mt) %>%
right_join(tibble(n_mt=0:19)) %>%
replace_na(list(n=0)) %>%
mutate(n_me = 19 - n_mt) %>%
mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
n_me >= 15 ~ "Meiotic",
TRUE ~ "Others")) %>%
# mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
ggplot(aes(n_mt, n, fill=Group)) +
geom_bar(stat="identity") +
scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
xlab("# of equational chromosome segregations") +
ylab("# of cells\n") +
theme_classic() +
# theme(legend.justification = c(1, 1), legend.position = c(1, 1)) +
NULL
##
par2 <- rstan::extract(fit2)
thetas <- colMeans(par2$Theta)
sizes <- round(100000 * thetas)
fit2_samples <- map_df(set_names(1:3), function(i) {
tibble("Sample"=rbinom(sizes[i], 19, par2$p[, i]))
}, .id="Group")
p2_fitted <- ggplot(fit2_samples, aes(Sample, fill=Group)) +
geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
xlab("# of equational chromosome segregations") +
ylab("Estimated fraction\n") +
theme_classic()
p2_obs <- data.frame(n_mt=n_class2) %>%
count(n_mt) %>%
right_join(tibble(n_mt=0:19)) %>%
replace_na(list(n=0)) %>%
mutate(n_me = 19 - n_mt) %>%
mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
n_me >= 15 ~ "Meiotic",
TRUE ~ "Others")) %>%
# mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
ggplot(aes(n_mt, n, fill=Group)) +
geom_bar(stat="identity") +
scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
xlab("# of equational chromosome segregations") +
ylab("# of cells\n") +
theme_classic()
##
par3 <- rstan::extract(fit3)
thetas <- colMeans(par3$Theta)
sizes <- round(100000 * thetas)
fit3_samples <- map_df(set_names(1:3), function(i) {
tibble("Sample"=rbinom(sizes[i], 19, par3$p[, i]))
}, .id="Group")
p3_fitted <- ggplot(fit3_samples, aes(Sample, fill=Group)) +
geom_bar(aes(y = (..count..)/sum(..count..)), alpha=0.5) +
scale_fill_manual(name="Mixing component", values=c("orangered", "grey80", "dodgerblue")) +
xlab("# of equational chromosome segregations") +
ylab("Estimated fraction\n") +
theme_classic()
p3_obs <- data.frame(n_mt=n_class3) %>%
count(n_mt) %>%
right_join(tibble(n_mt=0:19)) %>%
replace_na(list(n=0)) %>%
mutate(n_me = 19 - n_mt) %>%
mutate(Group=case_when(n_mt >= 15 ~ "Mitotic",
n_me >= 15 ~ "Meiotic",
TRUE ~ "Others")) %>%
# mutate(n_mt_fct = factor(.$n_mt, levels=as.character(0:19))) %>%
ggplot(aes(n_mt, n, fill=Group)) +
geom_bar(stat="identity") +
scale_fill_manual(name="Mixing component", values=c("Meiotic"="orangered", "Mitotic"="dodgerblue", "Others"="grey80")) +
xlab("# of equational chromosome segregations") +
ylab("# of cells\n") +
theme_classic()
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS4A.pdf"), 4, 5)
grid::grid.draw(rbind(ggplotGrob(p1_fitted), ggplotGrob(p1_obs), size="last"))
dev.off()
pdf(file.path(fig_dir, "figS4B.pdf"), 4, 5)
grid::grid.draw(rbind(ggplotGrob(p2_fitted), ggplotGrob(p2_obs), size="last"))
dev.off()
pdf(file.path(fig_dir, "figS4C.pdf"), 4, 5)
grid::grid.draw(rbind(ggplotGrob(p3_fitted), ggplotGrob(p3_obs), size="last"))
dev.off()
}
mm10_fai_df <- read_tsv(slfile("mm10.fa.fai"), col_names=c("CHROM", "CHROM_SIZE", "BI", "BASES_PER_LINE", "BYTES_PER_LINE"))
m2_spret_df <- read_tsv(slfile("m2diploid.concensus.mb.spret.tab"))
m2_spret_upd <- read_tsv(slfile("m2.upd.spret.tab"))
Number of haploid break points normalized by chromosome size ~ chromosome size
df <- hb_spret_df %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.6624745
## p-value:
## 0.001997309
if (!is.na(fig_dir)) ggsave("figS5A.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of M2 diploid break points normalized by chromosome size ~ chromosome size
df <- m2_spret_df %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.8328771
## p-value:
## 9.606707e-06
if (!is.na(fig_dir)) ggsave("figS5B.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of haploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size
df <- hb_spret_df %>%
distinct(CELL_ID, CHROM) %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (alt_cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.8667679
## p-value:
## 1.579717e-06
if (!is.na(fig_dir)) ggsave("figS6A.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of M2 diploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size
df <- m2_spret_df %>%
distinct(CELL_ID, CHROM) %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (alt_cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.9079146
## p-value:
## 7.905036e-08
if (!is.na(fig_dir)) ggsave("figS6B.pdf", p, "pdf", fig_dir, width=3, height=3)
Need cast data in addition to spretus here
co_per_chr_hb_spret_df <- hb_spret_df %>%
group_by(CELL_ID, CHROM) %>%
summarise(N_BREAKS_CHROM=n()) %>%
ungroup() %>%
right_join(expand.grid(unique(hb_spret_df$CELL_ID), setdiff(unique(hb_spret_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
replace_na(list(N_BREAKS_CHROM=0L))
co_per_chr_hb_cast_df <- hb_cast_df %>%
group_by(CELL_ID, CHROM) %>%
summarise(N_BREAKS_CHROM=n()) %>%
ungroup() %>%
right_join(expand.grid(unique(hb_cast_df$CELL_ID), setdiff(unique(hb_cast_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
replace_na(list(N_BREAKS_CHROM=0L))
df <- count(co_per_chr_hb_spret_df, N_BREAKS_CHROM) %>%
mutate(Strain="B6/Spret") %>%
mutate(freq=n/sum(n)) %>%
bind_rows(count(co_per_chr_hb_cast_df, N_BREAKS_CHROM) %>%
mutate(Strain="B6/Cast") %>%
mutate(freq=n/sum(n)))
p_freq <- ggplot(df, aes(N_BREAKS_CHROM, freq, group=Strain)) +
geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
scale_fill_manual(values=c("orangered", "coral")) +
xlab("# of CO per chr") +
ylab("Frequency") +
scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
theme_classic()
p_n <- ggplot(df, aes(N_BREAKS_CHROM, n, group=Strain)) +
geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
scale_fill_manual(values=c("orangered", "coral")) +
xlab("# of CO per chr") +
ylab("Count") +
scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
theme_classic()
p_freq
p_n
if (!is.na(fig_dir)) ggsave("figS5C.pdf", p_freq, "pdf", fig_dir, width=4, height=2.5)
if (!is.na(fig_dir)) ggsave("figS6C.pdf", p_n, "pdf", fig_dir, width=4, height=2.5)
co_per_chr_m2_spret_df <- m2_spret_df %>%
group_by(CELL_ID, CHROM) %>%
summarise(N_BREAKS_CHROM=n()) %>%
ungroup() %>%
right_join(expand.grid(unique(m2_spret_df$CELL_ID), setdiff(unique(m2_spret_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
replace_na(list(N_BREAKS_CHROM=0L))
co_per_chr_m2_cast_df <- m2_cast_df %>%
group_by(CELL_ID, CHROM) %>%
summarise(N_BREAKS_CHROM=n()) %>%
ungroup() %>%
right_join(expand.grid(unique(m2_cast_df$CELL_ID), setdiff(unique(m2_cast_df$CHROM), c("chrX", "chrY")), stringsAsFactors=FALSE), by=c("CELL_ID"="Var1", "CHROM"="Var2")) %>%
replace_na(list(N_BREAKS_CHROM=0L))
df <- count(co_per_chr_m2_spret_df, N_BREAKS_CHROM) %>%
mutate(Strain="B6/Spret") %>%
mutate(freq=n/sum(n)) %>%
bind_rows(count(co_per_chr_m2_cast_df, N_BREAKS_CHROM) %>%
mutate(Strain="B6/Cast") %>%
mutate(freq=n/sum(n)))
p_freq <- ggplot(df, aes(N_BREAKS_CHROM, freq, group=Strain)) +
geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
scale_fill_manual(values=c("dodgerblue", "deepskyblue")) +
xlab("# of CO per chr") +
ylab("Frequency") +
scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
theme_classic()
p_n <- ggplot(df, aes(N_BREAKS_CHROM, n, group=Strain)) +
geom_bar(aes(fill=Strain), stat='identity', position=position_dodge()) +
scale_fill_manual(values=c("dodgerblue", "deepskyblue")) +
xlab("# of CO per chr") +
ylab("Count") +
scale_x_continuous(breaks=c(0:5), limits=c(-0.5, 5)) +
theme_classic()
p_freq
p_n
if (!is.na(fig_dir)) ggsave("figS5D.pdf", p_freq, "pdf", fig_dir, width=4, height=2.5)
if (!is.na(fig_dir)) ggsave("figS6D.pdf", p_n, "pdf", fig_dir, width=4, height=2.5)
all_spret_upd <- read_tsv(slfile("all.upd.spret.tab"))
all_spret_upd_w_cellstatus <- all_spret_upd %>%
mutate(CELL_STATUS=case_when(CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid",
CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid",
TRUE ~ "Other"))
# Uniparental chromosome dist for M2 diploid cells
m2_spret_hist_input <- m2_spret_upd %>%
group_by(CELL_ID) %>%
summarise(count=sum(STRAIN %in% c("spret","b6"))) %>%
group_by(count) %>%
summarise(number=n())
m2_spret_hist_input[1,2] <- m2_spret_hist_input[1,2] + 41L
# 41 cells with crossover on every chromosome (240 - 199)
# Uniparental chromosome dist for haploid cells
all_spret_upd_chr_per_cell_dist <- all_spret_upd %>%
left_join(haploid_spret_upd, by=c("CELL_ID", "CHROM")) %>%
mutate(CELL_STATUS=case_when(CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid",
CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid",
TRUE ~ "Other")) %>%
mutate(UPD_STATUS=case_when(STRAIN %in% c("b6", "spret") ~ "UPD",
TRUE ~ "NOUPD")) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
group_by(CELL_ID, CELL_STATUS) %>%
summarise(N_UPD_CHR=sum(UPD_STATUS == "UPD")) %>%
group_by(CELL_STATUS, N_UPD_CHR) %>%
summarise(N_CELL=n()) %>%
mutate(N_UPD_CHR=as.integer(N_UPD_CHR), N_CELL=as.integer(N_CELL))
all_spret_upd_chr_per_cell_dist[18, 2:3] <- c(0L, 41L) # doesn't seem to matter because only haploid will be used from this table but anyways...
df <- m2_spret_hist_input %>%
mutate(CELL_STATUS="M2 diploid") %>%
dplyr::rename(N_CELL=number, N_UPD_CHR=count) %>%
bind_rows(all_spret_upd_chr_per_cell_dist %>%
filter(CELL_STATUS == "Haploid")) %>%
bind_rows(all_spret_upd_w_cellstatus %>%
filter(CELL_STATUS == "Other") %>%
group_by(CELL_ID) %>%
summarise(N_UPD_CHR=sum(UPD_STATUS != "OTHER")) %>%
group_by(N_UPD_CHR) %>%
summarise(N_CELL=n()) %>%
mutate(CELL_STATUS="Other"))
p <- ggplot(df, aes(N_UPD_CHR, N_CELL)) +
geom_bar(stat='identity') +
xlab("# of uniparental chromosomes") +
ylab("# of cells") +
facet_wrap(~CELL_STATUS, scales='free_y') +
theme_classic() +
theme(strip.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS5F.pdf", p, "pdf", fig_dir, width=4, height=1.5)
all_spret_upd_chr_dist_other <- all_spret_upd_w_cellstatus %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
filter(CELL_STATUS=="Other") %>%
group_by(CELL_STATUS, CHROM) %>%
summarise(N_CELL_WITH_UPD_CHR=sum(UPD_STATUS %in% c("REF", "ALT")))
all_spret_upd_chr_dist_hp <- haploid_spret_upd %>%
mutate(CELL_STATUS="Haploid") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
group_by(CELL_STATUS, CHROM) %>%
summarise(N_CELL_WITH_UPD_CHR=sum(STRAIN %in% c("spret", "b6")))
all_spret_upd_chr_dist_m2 <- m2_spret_upd %>%
mutate(CELL_STATUS="M2 diploid") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
group_by(CELL_STATUS, CHROM) %>%
summarise(N_CELL_WITH_UPD_CHR=sum(STRAIN %in% c("spret", "b6")))
all_spret_upd_chr_dist<-bind_rows(all_spret_upd_chr_dist_hp, all_spret_upd_chr_dist_m2, all_spret_upd_chr_dist_other) %>%
mutate(CHROM=as.integer(gsub("chr", "", CHROM)))
p <- ggplot(all_spret_upd_chr_dist, aes(CHROM, N_CELL_WITH_UPD_CHR)) +
geom_bar(stat='identity') +
xlab("Chromosome") +
ylab("# of cells") +
scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
facet_wrap(~CELL_STATUS, scales='free_y') +
theme_classic() +
theme(strip.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS5G.pdf", p, "pdf", fig_dir, width=4, height=1.5)
# sampled null tracts, use them to calculate null distribution of co distances
hb_spret_ctrl_df <- read_tsv(slfile("hb_sample.01.bed"), col_names=c("CHROM", "START", "END"))
m2_spret_ctrl_df <- read_tsv(slfile("m2-sampled-track_01.spret.tab"))
bp_spret_ctrl_df <- bind_rows(hb_spret_ctrl_df, m2_spret_ctrl_df)
# true distances
bp_spret_df <- bind_rows(hb_spret_df %>% mutate(TYPE="haploid"),
m2_spret_df %>% mutate(TYPE="m2 diploid"))
bp_spret_distance_df <- map_df(set_names(unique(bp_spret_df$TYPE)), function(type) {
map_df(set_names(setdiff(unique(bp_spret_df$CHROM), c("chrX", "chrY"))), function(chr) {
chr_df <- filter(bp_spret_df, CHROM == chr & TYPE == type)
map_df(set_names(unique(chr_df$CELL_ID)), function(cell) {
cc_df <- filter(chr_df, CELL_ID == cell)
if (nrow(cc_df) < 2) return(NULL)
dists <- c()
for (i in seq_len(nrow(cc_df) - 1)) {
j <- i + 1 # use distance of adjacent breakpoints only
dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
}
return(tibble(DISTANCE=dists))
}, .id="CELL_ID")
}, .id="CHROM")
}, .id="TYPE")
bp_spret_distance_df_for_plot <- bp_spret_distance_df %>%
mutate(CHROM="ALL") %>%
bind_rows(bp_spret_distance_df)
bp_spret_distance_df_for_plot$CHROM <- factor(bp_spret_distance_df_for_plot$CHROM, levels=c(chr_levels, "ALL"))
sizes <- table(bp_spret_distance_df$CHROM)
bp_spret_ctrl_distance_df <- map_df(names(sizes), function(c) {
choices <- bp_spret_ctrl_df %>%
filter(CHROM == c)
idx1 <- sample(seq_len(nrow(choices)))
idx2 <- sample(seq_len(nrow(choices)))
ret <- tibble()
sampled_size <- 0
while (sampled_size < sizes[c]) {
sampled_choices <- bind_cols(choices[idx1, ], choices[idx2, ]) %>%
filter((START < START1 & END < START1) | (START > END1 & START > START1)) %>%
rowwise() %>%
mutate(DISTANCE=bp_distance(START, END, START1, END1))
ret <- distinct(bind_rows(ret, sampled_choices))
ret <- filter(ret, DISTANCE > 1e5) # min_block size in call_state_block is 1e5 so
sampled_size <- nrow(ret)
}
return(ret[1:sizes[c], ])
})
bp_spret_ctrl_distance_df_for_plot <- bind_rows(bp_spret_ctrl_distance_df %>% mutate(CHROM="ALL"),
bp_spret_ctrl_distance_df)
p_4e <- bind_rows(bp_spret_distance_df_for_plot %>% mutate(TYPE=case_when(TYPE=="haploid" ~ "Haploid", TYPE=="m2 diploid" ~ "M2 diploid")),
bp_spret_ctrl_distance_df_for_plot %>% mutate(TYPE="Expected")) %>%
mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if fig 4e
ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
geom_histogram(position="dodge", bins=20) +
scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
facet_wrap(~CHROM, scales="free") +
xlab("Distance between break points (M)") +
ylab("Count") +
theme_classic() +
theme(strip.background = element_blank())
p_s5e <- bind_rows(bp_spret_distance_df_for_plot %>% mutate(TYPE=case_when(TYPE=="haploid" ~ "Haploid", TYPE=="m2 diploid" ~ "M2 diploid")),
bp_spret_ctrl_distance_df_for_plot %>% mutate(TYPE="Expected")) %>%
mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
# filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if fig 4e
ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
geom_histogram(position="dodge", bins=20) +
scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
facet_wrap(~CHROM, scales="free") +
xlab("Distance between break points (M)") +
ylab("Count") +
theme_classic() +
theme(strip.background = element_blank())
p_4e
p_s5e
# observed distance mean and median, both haploid and m2 diploid combined
bp_spret_distance_df %>%
summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
## DISTANCE_MEAN DISTANCE_MEDIAN
## <dbl> <dbl>
## 1 80581864. 82033170.
# simulated null distance mean and median, both haploid and m2 diploid combined
bp_spret_ctrl_distance_df %>%
summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
## DISTANCE_MEAN DISTANCE_MEDIAN
## <dbl> <dbl>
## 1 47322986. 40138581
# wilcoxon test for distance
w <- wilcox.test(bp_spret_distance_df$DISTANCE, bp_spret_ctrl_distance_df$DISTANCE)
w$p.value
## [1] 3.774328e-271
if (!is.na(fig_dir)) ggsave("figS5E.pdf", p_4e, "pdf", fig_dir, width=5, height=4)
if (!is.na(fig_dir)) ggsave("figS6E.pdf", p_s5e, "pdf", fig_dir, width=8, height=4)
spret_all_cell_status <- read_tsv(slfile("all.cellstatus.spret.tab"), col_names=c("CELL_ID", "CELL_STATUS", "B", "C"))
spret_mt_prop_df <- read_tsv(slfile("all.MT.prop.summary.spret.tab"), col_names=c("CELL_ID", "PROP_MT"))
spret_mt_prop_cell_status <- left_join(spret_mt_prop_df, spret_all_cell_status, by="CELL_ID") %>%
mutate(CELL_STATUS=case_when(CELL_ID %in% m2_spret_df$CELL_ID ~ "M2 diploid",
CELL_ID %in% hb_spret_df$CELL_ID ~ "Haploid",
TRUE ~ "Other"))
spret_mt_prop_m2d_status <- spret_mt_prop_cell_status %>%
filter(CELL_STATUS == "M2 diploid") %>%
mutate(CELL_STATUS=case_when(CELL_ID %in% filter(all_m2_merge, TYPE == "Meiotic cell")$CELL_ID ~ "M2 diploid (meiotic)",
CELL_ID %in% filter(all_m2_merge, TYPE == "Mitotic cell")$CELL_ID ~ "M2 diploid (mitotic)",
TRUE ~ "M2 diploid (other)"))
cell_status_levels <- c("Haploid", "M2 diploid", "Other", "M2 diploid (meiotic)", "M2 diploid (mitotic)", "M2 diploid (other)")
p <- ggplot(bind_rows(spret_mt_prop_cell_status, spret_mt_prop_m2d_status) %>% mutate(CELL_STATUS=factor(CELL_STATUS, levels=cell_status_levels)), aes(PROP_MT + 0.1)) +
geom_histogram(bins=50) +
scale_x_log10(limits=c(0.05, max(spret_mt_prop_cell_status$PROP_MT))) +
facet_wrap(~CELL_STATUS, scales='free', nrow=2) +
xlab("Proportion of mitochondria reads (normalized)") +
ylab("Count") +
theme_classic() +
theme(strip.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS5I_S6G.pdf", p, "pdf", fig_dir, width=5, height=3.5)
chromosome distribution for “reverse segregation” (for all TYPE=“Meiotic cell” defined in “all_m2_merge”, sum up SEGREGATION=“mt” in m2_spret_df and STRAIN=“het” in m2_spret_upd, aggregate by chromosome, generate a df similar to “all_upd_chr_dist_m2”, call it “meiotic_rev_chr_dist_m2_spret”); same for cast.
# for spret
rev_cell_spret_ids <- filter(all_m2_merge, TYPE == "Meiotic cell")$CELL_ID
meiotic_rev_chr_dist_m2_spret <- m2_spret_df %>%
filter(SEGREGATION == "mt") %>%
distinct(CELL_ID, CHROM) %>%
bind_rows(m2_spret_upd %>%
filter(STRAIN == "het")) %>%
filter(CELL_ID %in% rev_cell_spret_ids) %>%
count(CHROM) %>%
mutate(CHROM=as.integer(gsub("chr", "", CHROM)))
p_spret <- ggplot(meiotic_rev_chr_dist_m2_spret, aes(CHROM, n)) +
geom_bar(stat='identity') +
xlab("Chromosome") +
ylab("# of cells") +
scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
theme_classic() +
theme(strip.background = element_blank())
# for cast
rev_cell_cast_ids <- filter(all_m2_merge_cast, TYPE == "Meiotic cell")$CELL_ID
meiotic_rev_chr_dist_m2_cast <- m2_cast_df %>%
filter(SEGREGATION == "mt") %>%
distinct(CELL_ID, CHROM) %>%
bind_rows(m2_cast_upd %>%
filter(STRAIN == "het")) %>%
filter(CELL_ID %in% rev_cell_cast_ids) %>%
count(CHROM) %>%
mutate(CHROM=as.integer(gsub("chr", "", CHROM)))
p_cast <- ggplot(meiotic_rev_chr_dist_m2_cast, aes(CHROM, n)) +
geom_bar(stat='identity') +
xlab("Chromosome") +
ylab("# of cells") +
scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
theme_classic() +
theme(strip.background = element_blank())
p_spret
p_cast
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS5H.pdf"), 1.5, 1.5)
print(p_spret)
print(p_cast)
dev.off()
}
## quartz_off_screen
## 2
# test if the number of chromosomes with rev seg per cell is higher in spret than cast
n_revseg_chr_per_cell_spret_df <- m2_spret_df %>%
filter(SEGREGATION == "mt") %>%
distinct(CELL_ID, CHROM) %>%
bind_rows(m2_spret_upd %>%
filter(STRAIN == "het")) %>%
filter(CELL_ID %in% rev_cell_spret_ids) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID) %>%
right_join(tibble(CELL_ID=rev_cell_spret_ids), by="CELL_ID") %>%
replace_na(list(n=0))
n_revseg_chr_per_cell_cast_df <- m2_cast_df %>%
filter(SEGREGATION == "mt") %>%
distinct(CELL_ID, CHROM) %>%
bind_rows(m2_cast_upd %>%
filter(STRAIN == "het")) %>%
filter(CELL_ID %in% rev_cell_cast_ids) %>%
distinct(CELL_ID, CHROM) %>%
count(CELL_ID) %>%
right_join(tibble(CELL_ID=rev_cell_cast_ids), by="CELL_ID") %>%
replace_na(list(n=0))
exactRankTests::wilcox.exact(n_revseg_chr_per_cell_cast_df$n, n_revseg_chr_per_cell_spret_df$n)
##
## Asymptotic Wilcoxon rank sum test
##
## data: n_revseg_chr_per_cell_cast_df$n and n_revseg_chr_per_cell_spret_df$n
## W = 5682.5, p-value = 1.621e-14
## alternative hypothesis: true mu is not equal to 0
lib202_upd <- read_tsv(slfile("lib202_upd.tab"))
# draw this for ref, alt and other
df <- lib202_upd %>%
count(CHROM, UPD_STATUS) %>%
filter(!CHROM %in% c("chr4", "chrX", "chrY")) %>%
mutate(CHROM=as.numeric(gsub("chr", "", CHROM))) %>%
mutate(UPD_STATUS=case_when(UPD_STATUS == "REF" ~ "B6",
UPD_STATUS == "ALT" ~ "Spret",
UPD_STATUS == "OTHER" ~ "Other")) %>%
filter(UPD_STATUS != "Other")
p <- bind_rows(df %>% mutate(UPD_STATUS="B6 + Spret"),
df) %>%
mutate(UPD_STATUS=factor(UPD_STATUS, levels=c("B6", "Spret", "B6 + Spret"))) %>%
ggplot(aes(CHROM, n)) +
geom_bar(stat='identity') +
facet_wrap(~UPD_STATUS, scales="free_y") +
xlab("Chromosome") +
ylab("# of cells") +
scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
theme_classic() +
theme(strip.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS6F1.pdf", p, "pdf", fig_dir, width=5, height=2)
# similar plot, separating mt, me, mt + me this time
df <- read_tsv(slfile("patski.mb.tab")) %>%
count(CHROM, SEGREGATION) %>%
filter(!CHROM %in% c("chr4", "chrX", "chrY")) %>%
mutate(CHROM=as.numeric(gsub("chr", "", CHROM))) %>%
mutate(SEGREGATION=case_when(SEGREGATION == "mt" ~ "Mitotic",
SEGREGATION == "me" ~ "Meiotic"))
p <- bind_rows(df %>% mutate(SEGREGATION="All LOH"),
df) %>%
mutate(SEGREGATION=factor(SEGREGATION, levels=c("Mitotic", "Meiotic", "All LOH"))) %>%
ggplot(aes(CHROM, n)) +
geom_bar(stat='identity') +
facet_wrap(~SEGREGATION, scales="free_y") +
xlab("Chromosome") +
ylab("# of cells") +
scale_x_continuous(breaks=c(1, 5, 10, 15, 19)) +
theme_classic() +
theme(strip.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS6F2.pdf", p, "pdf", fig_dir, width=5, height=2)
Number of haploid break points normalized by chromosome size ~ chromosome size, cast
df <- hb_cast_df %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(hb_spret_df$CELL_ID)) / CHROM_SIZE * 2 * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.6511605
## p-value:
## 0.002529338
if (!is.na(fig_dir)) ggsave("figS7A.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of M2 diploid break points normalized by chromosome size ~ chromosome size, cast
df <- m2_cast_df %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(m2_spret_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.8984839
## p-value:
## 1.751928e-07
if (!is.na(fig_dir)) ggsave("figS7B.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of haploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size, cast
df <- hb_cast_df %>%
distinct(CELL_ID, CHROM) %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(hb_cast_df$CELL_ID)) / CHROM_SIZE * 2 * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (alt_cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.8479662
## p-value:
## 4.537226e-06
if (!is.na(fig_dir)) ggsave("figS7C.pdf", p, "pdf", fig_dir, width=3, height=3)
Number of M2 diploid break points (at least one on a chromosome) normalized by chromosome size ~ chromosome size, cast
df <- m2_cast_df %>%
distinct(CELL_ID, CHROM) %>%
count(CHROM) %>%
left_join(mm10_fai_df, by="CHROM") %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
mutate(CHROM_SIZE=CHROM_SIZE / 1e6) %>%
mutate(CHROM=gsub("chr", "", CHROM)) %>%
mutate(NORM_N=n / length(unique(m2_cast_df$CELL_ID)) / CHROM_SIZE * 100)
p <- ggplot(df, aes(CHROM_SIZE, NORM_N)) +
geom_point(alpha=0.7) +
geom_text_repel(aes(label=CHROM)) +
xlab("Chromosome size (Mb)") +
ylab("Crossover density (alt_cM / Mb)") +
theme_classic()
p
r <- cor.test(df$NORM_N, df$CHROM_SIZE, method="pearson", exact=TRUE)
cat("pearson r:\n", r$estimate, "\np-value:\n", r$p.value, "\n")
## pearson r:
## -0.9393099
## p-value:
## 2.546956e-09
if (!is.na(fig_dir)) ggsave("figS7D.pdf", p, "pdf", fig_dir, width=3, height=3)
# sampled null tracts, use them to calculate null distribution of co distances
ctrl_cast_df <- read_tsv(slfile("cast/sample.srt.uniq.bed"), col_names=c("CHROM", "START", "END", "TYPE")) %>%
mutate(TYPE=case_when(TYPE=="hp" ~ "Haploid", TYPE=="m2" ~ "M2 diploid"))
# true distances
bp_cast_df <- bind_rows(hb_cast_df %>% mutate(TYPE="Haploid"),
m2_cast_df %>% mutate(TYPE="M2 diploid"))
lst <- list()
# for(type in unique(bp_cast_df$TYPE)) {
# for(chr in setdiff(unique(bp_cast_df$CHROM), c("chrX", "chrY"))) {
# chr_df <- filter(bp_cast_df, CHROM == chr & TYPE == type)
# for(cell in unique(chr_df$CELL_ID)) {
# cc_df <- filter(chr_df, CELL_ID == cell)
# if (nrow(cc_df) < 2) next
# dists <- c()
# for (i in seq_len(nrow(cc_df) - 1)) {
# j <- i + 1 # use distance of adjacent breakpoints only
# dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
# }
# lst[[length(lst) + 1]] <- dists
# }
# }
# }
bp_cast_distance_df <- map_df(set_names(unique(bp_cast_df$TYPE)), function(type) {
map_df(set_names(setdiff(unique(bp_cast_df$CHROM), c("chrX", "chrY"))), function(chr) {
chr_df <- filter(bp_cast_df, CHROM == chr & TYPE == type)
map_df(set_names(unique(chr_df$CELL_ID)), function(cell) {
cc_df <- filter(chr_df, CELL_ID == cell)
if (nrow(cc_df) < 2) return(NULL)
dists <- c()
for (i in seq_len(nrow(cc_df) - 1)) {
j <- i + 1 # use distance of adjacent breakpoints only
dists <- c(dists, bp_distance(cc_df[[i, "START"]], cc_df[[i, "END"]], cc_df[[j, "START"]], cc_df[[j, "END"]]))
}
return(tibble(DISTANCE=dists))
}, .id="CELL_ID")
}, .id="CHROM")
}, .id="TYPE")
bp_cast_distance_df_for_plot <- bp_cast_distance_df %>%
mutate(CHROM="ALL") %>%
bind_rows(bp_cast_distance_df)
bp_cast_distance_df_for_plot$CHROM <- factor(bp_cast_distance_df_for_plot$CHROM, levels=c(chr_levels, "ALL"))
sizes <- table(bp_cast_distance_df$CHROM)
bp_ctrl_distance_cast_df <- map_df(names(sizes), function(c) {
choices <- ctrl_cast_df %>%
filter(CHROM == c)
idx1 <- sample(seq_len(nrow(choices)))
idx2 <- sample(seq_len(nrow(choices)))
ret <- tibble()
sampled_size <- 0
while (sampled_size < sizes[c]) {
sampled_choices <- bind_cols(choices[idx1, ], choices[idx2, ]) %>%
filter((START < START1 & END < START1) | (START > END1 & START > START1)) %>%
rowwise() %>%
mutate(DISTANCE=bp_distance(START, END, START1, END1))
ret <- distinct(bind_rows(ret, sampled_choices))
ret <- filter(ret, DISTANCE > 1e5) # min_block size in call_state_block is 1e5 so
sampled_size <- nrow(ret)
}
return(ret[1:sizes[c], ])
})
bp_ctrl_distance_cast_df_for_plot <- bind_rows(bp_ctrl_distance_cast_df %>% mutate(CHROM="ALL"),
bp_ctrl_distance_cast_df)
p <- bind_rows(bp_cast_distance_df_for_plot,
bp_ctrl_distance_cast_df_for_plot %>% mutate(TYPE="Expected")) %>%
mutate(CHROM=factor(CHROM, levels=c(chr_levels, "ALL"))) %>%
# filter(CHROM %in% c("chr1", "chr2", "chr12", "chr13")) %>% # uncomment if plotting all chromosomes
ggplot(aes(DISTANCE / 1e6, fill=TYPE)) +
geom_histogram(position="dodge", bins=20) +
scale_fill_manual(name="", values=c("Haploid"="orangered", "M2 diploid"="dodgerblue", "Expected"="grey80")) +
facet_wrap(~CHROM, scales="free") +
xlab("Distance between break points (M)") +
ylab("Count") +
theme_classic() +
theme(strip.background = element_blank())
p
# if (!is.na(fig_dir)) ggsave("fig4E.pdf", p, "pdf", fig_dir, width=5, height=4) # if only plotting subset
if (!is.na(fig_dir)) ggsave("figS7E.pdf", p, "pdf", fig_dir, width=8, height=4)
# observed distance mean and median, both haploid and m2 diploid combined
bp_cast_distance_df %>%
summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
## DISTANCE_MEAN DISTANCE_MEDIAN
## <dbl> <int>
## 1 97212487. 97127834
# simulated null distance mean and median, both haploid and m2 diploid combined
bp_ctrl_distance_cast_df %>%
summarise(DISTANCE_MEAN=mean(DISTANCE), DISTANCE_MEDIAN=median(DISTANCE))
## # A tibble: 1 x 2
## DISTANCE_MEAN DISTANCE_MEDIAN
## <dbl> <int>
## 1 49953736. 43243594
# wilcoxon test for distance
w <- wilcox.test(bp_cast_distance_df$DISTANCE, bp_ctrl_distance_cast_df$DISTANCE)
w$p.value
## [1] 0
Compare distances between breaks in B6/Spret and B6/Cast.
w <- wilcox.test(bp_spret_distance_df$DISTANCE, bp_cast_distance_df$DISTANCE)
w$p.value
## [1] 1.08051e-113
p <- bind_rows(bp_spret_distance_df %>% select(DISTANCE) %>% mutate(Strain="B6/Spret"),
bp_cast_distance_df %>% select(DISTANCE) %>% mutate(Strain="B6/Cast")) %>%
mutate(`Distance (Mb)`=DISTANCE / 1e6) %>%
ggplot(aes(Strain, `Distance (Mb)`)) +
geom_boxplot(outlier.colour="grey50", outlier.alpha=0.5) +
theme_bw() +
scale_y_continuous(labels=scales::format_format(scientific = FALSE),
breaks=scales::pretty_breaks(n=4)) +
theme(legend.position='none',
axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank())
p
if (!is.na(fig_dir)) ggsave("figS5E_2.pdf", p, "pdf", fig_dir, height=2.5, width=2)
bp_spret <- makeGRangesFromDataFrame(data.frame(
bind_rows(hb_spret_df, m2_spret_df) %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_cast <- makeGRangesFromDataFrame(data.frame(
bind_rows(hb_cast_df, m2_cast_df) %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_hap <- makeGRangesFromDataFrame(data.frame(
hb_spret_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_m2 <- makeGRangesFromDataFrame(data.frame(
m2_spret_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
ssds_b6_df <- read_tsv(slfile("ssds_b6.tab"), col_names=c("chr", "start", "end", "signal"))
ssds_cast_df <- read_tsv(slfile("ssds_cast.tab"), col_names=c("chr", "start", "end", "signal"))
ssds_b6_cast_df <- read_tsv(slfile("ssds_b6_cast.tab"), col_names=c("chr", "start", "end", "signal"))
ssds_b6 <- makeGRangesFromDataFrame(data.frame(
ssds_b6_df %>% dplyr::select(chr, start, end)))
ssds_cast <- makeGRangesFromDataFrame(data.frame(
ssds_cast_df %>% dplyr::select(chr, start, end)))
ssds_b6_cast <- makeGRangesFromDataFrame(data.frame(
ssds_b6_cast_df %>% dplyr::select(chr, start, end)))
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast, col="green", r0=0, r1=0.19, free.y=TRUE)
kpPlotCoverage(kp, bp_spret, col="purple", r0=0.2, r1=0.39, free.y=TRUE)
kpPlotDensity(kp, ssds_b6_cast, col="grey10", r0=0.4, r1=0.59)
kpPlotDensity(kp, ssds_cast, col="orangered", r0=0.6, r1=0.79)
kpPlotDensity(kp, ssds_b6, col="dodgerblue", r0=0.8, r1=0.99)
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS8A.pdf"), 8, 20)
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast, col="green", r0=0, r1=0.19, free.y=TRUE)
kpPlotCoverage(kp, bp_spret, col="purple", r0=0.2, r1=0.39, free.y=TRUE)
kpPlotDensity(kp, ssds_b6_cast, col="grey10", r0=0.4, r1=0.59)
kpPlotDensity(kp, ssds_cast, col="orangered", r0=0.6, r1=0.79)
kpPlotDensity(kp, ssds_b6, col="dodgerblue", r0=0.8, r1=0.99)
dev.off()
}
## quartz_off_screen
## 2
same as Fig. S8A, but with tracts for SSDS_F1, B6/Cast haploid, B6/Cast M2 diploid
ssds_f1 <- makeGRangesFromDataFrame(data.frame(
read_tsv(slfile("ssds_b6_cast.tab"), col_names=c("chr", "start", "end", "signal")) %>% distinct()))
bp_cast_hap <- makeGRangesFromDataFrame(data.frame(
hb_cast_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_cast_m2 <- makeGRangesFromDataFrame(data.frame(
m2_cast_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast_m2, col="green", r0=0, r1=0.33, free.y=TRUE)
kpPlotCoverage(kp, bp_cast_hap, col="purple", r0=0.34, r1=0.66, free.y=TRUE)
kpPlotDensity(kp, ssds_f1, col="grey10", r0=0.67, r1=0.99)
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS8B.pdf"), 8, 20)
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_cast_m2, col="green", r0=0, r1=0.33, free.y=TRUE)
kpPlotCoverage(kp, bp_cast_hap, col="purple", r0=0.34, r1=0.66, free.y=TRUE)
kpPlotDensity(kp, ssds_f1, col="grey10", r0=0.67, r1=0.99)
dev.off()
}
## quartz_off_screen
## 2
same as Fig. S8A, but with tracts for Spo11_both (dodgerblue), Spo11_with all hits(orangered), B6/Spret haploid(purple), B6/Spret M2 diploid (green)
spo11_both <- makeGRangesFromDataFrame(data.frame(
read_tsv(slfile("spo11_b6_and_spret.tab")) %>%
distinct() %>%
dplyr::rename(chr=CHROM, start=START, end=END, signal=SPO11_BOTH)))
spo11_all_hits <- makeGRangesFromDataFrame(data.frame(
read_tsv(slfile("B6.Spo11.withHits.txt")) %>%
dplyr::select(chr, Start, End, Hits) %>%
distinct() %>%
dplyr::rename(start=Start, end=End, signal=Hits)
))
bp_spret_hap <- makeGRangesFromDataFrame(data.frame(
hb_spret_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
bp_spret_m2 <- makeGRangesFromDataFrame(data.frame(
m2_spret_df %>%
dplyr::rename(chr=CHROM, start=START, end=END) %>% dplyr::select(chr, start, end)))
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_spret_m2, col="green", r0=0, r1=0.24, free.y=TRUE)
kpPlotCoverage(kp, bp_spret_hap, col="purple", r0=0.25, r1=0.49, free.y=TRUE)
kpPlotDensity(kp, spo11_all_hits, col="orangered", r0=0.50, r1=0.74)
kpPlotDensity(kp, spo11_both, col="dodgerblue", r0=0.75, r1=0.99)
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS8C.pdf"), 8, 20)
kp <- plotKaryotype(genome="mm10", chromosomes=paste0("chr", as.character(1:19)))
kpPlotCoverage(kp, bp_spret_m2, col="green", r0=0, r1=0.24, free.y=TRUE)
kpPlotCoverage(kp, bp_spret_hap, col="purple", r0=0.25, r1=0.49, free.y=TRUE)
kpPlotDensity(kp, spo11_all_hits, col="orangered", r0=0.50, r1=0.74)
kpPlotDensity(kp, spo11_both, col="dodgerblue", r0=0.75, r1=0.99)
dev.off()
}
## quartz_off_screen
## 2
B6/Spret, M2 diploid, all_m2_merge, 38 TYPE=Mitotic cells, 202 TYPE=Meiotic cell 1. df1: take each chromosome, only keep the coordinate closest to CEN (smallest coordinate), ignore other crossovers on the same chromosome, plot along chr. coordinate, one color for mt, one color for me, do this for each of the 19 chr. 2. df2: same as above, only keep the coordinate farthest from CEN 3. df3: aggregate all chromosome, make one plot, normalize by chr. size, i.e., instead of showing chr. coordinate on the X axis for each chr., only make one plot, X axis from 0 (left end of chr) to 1 (right end of chr). Do the same thing for B6/Cast, M2 diploid, all_m2_merge_cast, 8 TYPE=Mitotic cells, 107 TYPE=Meiotic cell
m2_spret_l_r <- m2_spret_df %>%
right_join(filter(all_m2_merge, TYPE %in% c("Meiotic cell", "Mitotic cell")), by="CELL_ID") %>%
mutate(TYPE=case_when(TYPE == "Meiotic cell" ~ "Reductionally segregating cells",
TYPE == "Mitotic cell" ~ "Equationally segregating cells")) %>%
mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
group_by(CELL_ID, CHROM, TYPE) %>%
summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
ungroup() %>%
mutate(CHROM=factor(CHROM, levels=chr_levels))
m2_cast_l_r <- m2_cast_df %>%
right_join(filter(all_m2_merge_cast, TYPE %in% c("Meiotic cell", "Mitotic cell")), by="CELL_ID") %>%
mutate(TYPE=case_when(TYPE == "Meiotic cell" ~ "Reductionally segregating cells",
TYPE == "Mitotic cell" ~ "Equationally segregating cells")) %>%
mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
group_by(CELL_ID, CHROM, TYPE) %>%
summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
ungroup() %>%
mutate(CHROM=factor(CHROM, levels=chr_levels))
hb_spret_l_r <- hb_spret_df %>%
filter(CHROM %in% chr_levels) %>%
mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
group_by(CELL_ID, CHROM) %>%
summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
ungroup() %>%
mutate(CHROM=factor(CHROM, levels=chr_levels))
hb_cast_l_r <- hb_cast_df %>%
filter(CHROM %in% chr_levels) %>%
mutate(MID=(START + END) / 2 / 1e6) %>% # Mb
group_by(CELL_ID, CHROM) %>%
summarise(MIN_MID=min(MID), MAX_MID=max(MID)) %>%
ungroup() %>%
mutate(CHROM=factor(CHROM, levels=chr_levels))
this_theme <- theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.ticks.y = element_blank(),
axis.text.y = element_blank(), axis.title.y = element_blank(),
axis.line.y=element_blank(),
axis.line.x=element_line(colour="black"),
strip.text.y=element_text(angle=180),
strip.background=element_rect(fill=NA))
trim <- TRUE
# p_spret_m2_left <- ggplot(m2_spret_l_r) +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.2) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("B6/Spret, left most break") +
# this_theme
# md_spret_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM), data=m2_spret_l_r)
p_spret_m2_right <- ggplot(m2_spret_l_r) +
geom_vline(aes(xintercept=MAX_MID), alpha=0.2) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
ggtitle("B6/Spret, right most break") +
this_theme
md_spret_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM), data=m2_spret_l_r)
# p_cast_m2_left <- ggplot(m2_cast_l_r) +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.2) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("B6/Cast, left most break") +
# this_theme
#
# md_cast_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM), data=m2_cast_l_r)
p_cast_m2_right <- ggplot(m2_cast_l_r) +
geom_vline(aes(xintercept=MAX_MID), alpha=0.2) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
ggtitle("B6/Cast, right most break") +
this_theme
md_cast_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM), data=m2_cast_l_r)
# p_hb_spret_cast_left <- bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
# hb_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
# ggplot() +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("Haploid cells, left most break") +
# this_theme
#
# md_hb_spret_cast_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
# data=bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
# hb_cast_l_r %>% mutate(TYPE="B6/Cast")))
#
# S10A
p_hb_spret_cast_right <- bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
hb_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
ggplot() +
geom_vline(aes(xintercept=MAX_MID), alpha=0.02) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
ggtitle("Haploid cells, right most break") +
this_theme
md_hb_spret_cast_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
data=bind_rows(hb_spret_l_r %>% mutate(TYPE="B6/Spret"),
hb_cast_l_r %>% mutate(TYPE="B6/Cast")))
#
# p_m2_spret_cast_left <- bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
# m2_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
# ggplot() +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.1) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("M2 diploid cells, left most break") +
# this_theme
#
# md_m2_spret_cast_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
# data=bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
# m2_cast_l_r %>% mutate(TYPE="B6/Cast")))
#
# 5C
p_m2_spret_cast_right <- bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
m2_cast_l_r %>% mutate(TYPE="B6/Cast")) %>%
ggplot() +
geom_vline(aes(xintercept=MAX_MID), alpha=0.1) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
ggtitle("M2 diploid cells, right most break") +
this_theme
md_m2_spret_cast_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
data=bind_rows(m2_spret_l_r %>% mutate(TYPE="B6/Spret"),
m2_cast_l_r %>% mutate(TYPE="B6/Cast")))
#
# p_spret_hb_m2_left <- bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
# m2_spret_l_r %>% mutate(TYPE="M2 diploid cells")) %>%
# ggplot() +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("B6/Spret, left most break") +
# this_theme
#
# md_spret_hb_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
# data=bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
# m2_spret_l_r %>% mutate(TYPE="M2 diploid cells")))
#
# 5D
p_spret_hb_m2_right <- bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
m2_spret_l_r %>% mutate(TYPE="M2 cells")) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
ggplot() +
geom_vline(aes(xintercept=MAX_MID, alpha=TYPE)) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
scale_alpha_manual(guide=FALSE, values=c("Haploid cells"=0.02, "M2 cells"=0.1)) +
ggtitle("B6/Spret, right most break") +
this_theme
md_spret_hb_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
data=bind_rows(hb_spret_l_r %>% mutate(TYPE="Haploid cells"),
m2_spret_l_r %>% mutate(TYPE="M2 cells")))
#
# p_cast_hb_m2_left <- bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
# m2_cast_l_r %>% mutate(TYPE="M2 diploid cells")) %>%
# ggplot() +
# geom_vline(aes(xintercept=MIN_MID), alpha=0.02) +
# stat_density(aes(MIN_MID), color="orangered", geom = "line") +
# facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
# xlab("Chromosome coordinate (Mb)") +
# xlim(c(0, 200)) +
# ggtitle("B6/Cast, left most break") +
# this_theme
#
# md_cast_hb_m2_left <- lmerTest::lmer(MIN_MID ~ TYPE + (1|CHROM),
# data=bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
# m2_cast_l_r %>% mutate(TYPE="M2 diploid cells")))
#
# S10B
p_cast_hb_m2_right <- bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
m2_cast_l_r %>% mutate(TYPE="M2 cells")) %>%
filter(!CHROM %in% c("chrX", "chrY")) %>%
ggplot() +
geom_vline(aes(xintercept=MAX_MID, alpha=TYPE)) +
stat_density(aes(MAX_MID), color="orangered", geom="line", trim=trim) +
facet_grid(CHROM~TYPE, scales="free_y", switch="y") +
xlab("Chromosome coordinate (Mb)") +
xlim(c(0, 200)) +
scale_alpha_manual(guide=FALSE, values=c("Haploid cells"=0.01, "M2 cells"=0.2)) +
ggtitle("B6/Cast, right most break") +
this_theme
md_cast_hb_m2_right <- lmerTest::lmer(MAX_MID ~ TYPE + (1|CHROM),
data=bind_rows(hb_cast_l_r %>% mutate(TYPE="Haploid cells"),
m2_cast_l_r %>% mutate(TYPE="M2 cells")))
p_m2_spret_cast_right
p_spret_hb_m2_right
p_hb_spret_cast_right
p_cast_hb_m2_right
p_cast_m2_right
p_spret_m2_right
library(gridExtra)
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "fig6CD.pdf"), 6.5, 5.5)
print(p_m2_spret_cast_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_m2_spret_cast_right)$coefficients)[, c(1,5)])
print(p_spret_hb_m2_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_spret_hb_m2_right)$coefficients)[, c(1,5)])
dev.off()
pdf(file.path(fig_dir, "figS12A.pdf"), 6.5, 5.5)
print(p_hb_spret_cast_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_hb_spret_cast_right)$coefficients)[, c(1,5)])
dev.off()
pdf(file.path(fig_dir, "figS12B.pdf"), 6.5, 5.5)
print(p_cast_hb_m2_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_cast_hb_m2_right)$coefficients)[, c(1,5)])
dev.off()
pdf(file.path(fig_dir, "figS12C.pdf"), 6.5, 5.5)
print(p_cast_m2_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_cast_m2_right)$coefficients)[, c(1,5)])
dev.off()
pdf(file.path(fig_dir, "figS12D.pdf"), 6.5, 5.5)
print(p_spret_m2_right)
grid::grid.newpage()
grid.table(as.data.frame(summary(md_spret_m2_right)$coefficients)[, c(1,5)])
dev.off()
}
## quartz_off_screen
## 2
p1 <- ggplot(bind_rows(hb_spret_df, m2_spret_df) %>% mutate(DISTANCE=DISTANCE / 1e3), aes(DISTANCE)) +
geom_histogram(binwidth=0.1) +
scale_x_log10(labels=scales::format_format(scientific = FALSE, digits=0)) +
xlab("Break point size (kb)") +
ylab("Count") +
theme_classic() +
theme(axis.text = element_text(hjust = 1)) +
ggtitle("B6/Spret")
p2 <- ggplot(bind_rows(hb_cast_df, m2_cast_df) %>% mutate(DISTANCE=DISTANCE / 1e3), aes(DISTANCE)) +
geom_histogram(binwidth=0.1) +
scale_x_log10(labels=scales::format_format(scientific = FALSE, digits=0)) +
xlab("Break point size (kb)") +
ylab("Count") +
theme_classic() +
theme(axis.text = element_text(hjust = 1)) +
ggtitle("B6/Cast")
p1
p2
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "fig6B.pdf"), 2.5, 2.5)
p1
p2
dev.off()
}
## quartz_off_screen
## 2
load(slfile("features.rda"))
spret_cast_window_data_df <- as.data.frame(spret_cast_window_data)
spret_cast_window_data_df <- spret_cast_window_data_df[!grepl("^event_", colnames(spret_cast_window_data_df)) | colnames(spret_cast_window_data_df) == "event_cast_spret"]
spret_cast_window_lm <- lm(log1p(event_cast_spret) ~ ., data=spret_cast_window_data_df)
if (!is.na(fig_dir)) {
sink(file.path(fig_dir, "spret_cast_window_lm_output.txt"))
print(summary(spret_cast_window_lm))
sink()
}
set.seed(12345)
spret_cast_window_bas <- bas.lm(log1p(event_cast_spret) ~ ., data=spret_cast_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(spret_cast_window_bas)
if (!is.na(fig_dir)) {
ggsave(file.path(fig_dir, "fig6A.pdf"), p, "pdf", width=7, height=12)
sink(file.path(fig_dir, "bas_spret_cast_coef.txt"))
print(coef(spret_cast_window_bas))
sink()
}
cast_window_data_df <- as.data.frame(cast_window_data)
cast_window_data_df <- cast_window_data_df[!grepl("^event_", colnames(cast_window_data_df)) | colnames(cast_window_data_df) == "event_hp_m2"]
cast_window_lm <- lm(log1p(event_hp_m2) ~ ., data=cast_window_data_df)
if (!is.na(fig_dir)) {
sink(file.path(fig_dir, "cast_window_lm_output.txt"))
print(summary(cast_window_lm))
sink()
}
set.seed(12345)
cast_window_bas <- bas.lm(log1p(event_hp_m2) ~ ., data=cast_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(cast_window_bas)
if (!is.na(fig_dir)) {
ggsave(file.path(fig_dir, "figS9A.pdf"), p, "pdf", width=8, height=9)
sink(file.path(fig_dir, "bas_cast_coef.txt"))
print(coef(cast_window_bas))
sink()
}
spret_window_data_df <- as.data.frame(spret_window_data)
spret_window_data_df <- spret_window_data_df[!grepl("^event_", colnames(spret_window_data_df)) | colnames(spret_window_data_df) == "event_hp_m2"]
spret_window_lm <- lm(log1p(event_hp_m2) ~ ., data=spret_window_data_df)
if (!is.na(fig_dir)) {
sink(file.path(fig_dir, "spret_window_lm_output.txt"))
print(summary(spret_window_lm))
sink()
}
set.seed(12345)
spret_window_bas <- bas.lm(log1p(event_hp_m2) ~ ., data=spret_window_data_df, n.models=2^14, method='BAS')
p <- bas_plot(spret_window_bas)
if (!is.na(fig_dir)) {
ggsave(file.path(fig_dir, "figS9B.pdf"), p, "pdf", width=8, height=9)
sink(file.path(fig_dir, "bas_spret_coef.txt"))
print(coef(spret_window_bas))
sink()
}
window_spret_features_for_cor <- window_spret_with_cor %>%
dplyr::select(-c(CHROM, START, END, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75)) %>%
mutate_if(sapply(., is.character), function(x) {ifelse(x == "Yes", 1, 0)})
window_spret_features_cor <- cor(window_spret_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_spret_features_cor)
## [1] 80 80
window_spret_features_order <- corrplot::corrMatOrder(window_spret_features_cor[1:75, 1:75], order = "hclust", hclust.method = "complete")
window_spret_features_order <- c(c(76:80), window_spret_features_order) # put responses on topleft
corrplot::corrplot(window_spret_features_cor[window_spret_features_order, window_spret_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 75)))
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS10.pdf"), 10, 10)
corrplot::corrplot(window_spret_features_cor[window_spret_features_order, window_spret_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 75)))
dev.off()
}
## quartz_off_screen
## 2
window_cast_features_for_cor <- window_cast_with_cor %>%
select(-c(CHROM, START, END, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75)) %>%
mutate_if(sapply(., is.character), function(x) {ifelse(x == "Yes", 1, 0)})
window_cast_features_cor <- cor(window_cast_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_cast_features_cor)
## [1] 73 73
window_cast_features_order <- corrplot::corrMatOrder(window_cast_features_cor[1:68, 1:68], order = "hclust", hclust.method = "complete")
window_cast_features_order <- c(c(69:73), window_cast_features_order)
corrplot::corrplot(window_cast_features_cor[window_cast_features_order, window_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 68)))
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "figS11.pdf"), 10, 10)
corrplot::corrplot(window_cast_features_cor[window_cast_features_order, window_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 68)))
dev.off()
}
## quartz_off_screen
## 2
cast_spret_combined_events <- list(
event_cast_spret = read_tsv(slfile("event_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_cast_spret")),
event_hp_cast_spret = read_tsv(slfile("haploid_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_hp_cast_spret")),
event_m2_cast_spret = read_tsv(slfile("m2_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_m2_cast_spret")),
event_mt_cast_spret = read_tsv(slfile("mt_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_mt_cast_spret")),
event_me_cast_spret = read_tsv(slfile("me_window.cast_spret.bed"), col_names=c("CHROM", "START", "END", "event_me_cast_spret")))
cast_spret_combined_events_df <- bind_cols(map(cast_spret_combined_events, ~.x[, 4]))
window_spret_cast_features_for_cor <- bind_cols(
window_spret_features_for_cor %>% dplyr::select(-starts_with("event_")),
window_cast_features_for_cor[setdiff(colnames(window_cast_features_for_cor), colnames(window_spret_features_for_cor))],
cast_spret_combined_events_df)
window_spret_cast_features_cor <- cor(window_spret_cast_features_for_cor, use='pairwise.complete.obs', method='spearman')
dim(window_spret_cast_features_cor)
## [1] 86 86
window_spret_cast_features_order <- corrplot::corrMatOrder(window_spret_cast_features_cor[1:81, 1:81], order = "hclust", hclust.method = "complete")
window_spret_cast_features_order <- c(c(82:86), window_spret_cast_features_order)
corrplot::corrplot(window_spret_cast_features_cor[window_spret_cast_features_order, window_spret_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 81)))
if (FALSE) {
pdf(file.path(fig_dir, "figSSS.pdf"), 10, 10)
corrplot::corrplot(window_spret_cast_features_cor[window_spret_cast_features_order, window_spret_cast_features_order], tl.cex=0.5, type="upper", method="color", tl.col=c(rep("red", 5), rep("steelblue", 81)))
dev.off()
}
plot_cell_pca_simp <- function(cell_pca, cell_df, x="PC1", y="PC2") {
s <- summary(cell_pca)
s1 <- s$importance["Proportion of Variance", x] * 100
s2 <- s$importance["Proportion of Variance", y] * 100
cell_pca_df <- cell_df %>%
mutate(PCX=cell_pca$x[, x],
PCY=cell_pca$x[, y]) %>%
mutate(TYPE=R.utils::capitalize(TYPE))
p <- ggplot(cell_pca_df, aes(PCX, PCY)) +
geom_point(aes(color=TYPE), alpha=0.4) +
scale_color_manual(values=c("Haploid"="dodgerblue", "M2 diploid"="orangered"), name="") +
xlab(sprintf("%s (%1.2f%%)", x, s1)) +
ylab(sprintf("%s (%1.2f%%)", y, s2)) +
theme_classic()
return(p)
}
plot_cell_pca <- function(cell_pca, cell_df, x="PC1", y="PC2", cols=c()) {
require(viridis)
require(gridExtra)
s <- summary(cell_pca)
s1 <- s$importance["Proportion of Variance", x] * 100
s2 <- s$importance["Proportion of Variance", y] * 100
cell_pca_df <- cell_df %>%
mutate(PCX=cell_pca$x[, x],
PCY=cell_pca$x[, y])
if (length(cols) == 0) cols <- setdiff(names(cell_pca_df)[-1], c("PCX", "PCY"))
plts <- map(cols, function(c) {
if (is.numeric(cell_pca_df[[c]])) {
this_df <- cell_pca_df
this_df[[c]] <- log1p(cell_pca_df[[c]])
} else {
this_df <- cell_pca_df
}
ggplot(this_df, aes(PCX, PCY)) +
geom_point(aes_string(color=c), alpha=0.6) +
scale_color_viridis(name=gsub("_upd", "_upc", c), discrete=!is.numeric(this_df[[c]])) +
# last minute change" chr3_upd to chr3_upc because it apparently confused people
xlab(sprintf("%s (%1.2f%%)", x, s1)) +
ylab(sprintf("%s (%1.2f%%)", y, s2)) +
theme_classic() +
theme(legend.position="bottom",
legend.title=element_text(size=25),
legend.text=element_text(size=12),
axis.title=element_text(size=25))
})
return(plts)
}
Figure too big so not embedded in html, see png.
spret_cell_pca <- prcomp((spret_cell_df_with_cor[-(1:2)]), scale.=T)
spret_sub_cols <- c("TYPE","chr3_bp","chr1_upd","TELOMERIC","QUANTILE_0_25_FLAG","SPO11_BOTH","SPO11_B6","SPO11_SPRET","SPO11_OTHER","CNV_SPRET_LOSS","CTCF","RNA_POL2","H3K27AC","H3K36ME3","H3K4ME3","LONG_RNA_SEQ","DMRT6","H3K27ME3","PS_ATAC","PS_H3K27AC","PS_COHESIN","PS_SMC1B_LAP","H4K5AC","H4K5BU","H4K8AC","H4K8BU","CTCFL","CTCF_B6","CTCF_SPRET","END_SEQ_B6_ETO","END_SEQ_SPRET_ETO","RAD21_B6","RAD21_SPRET","TOP2A_MEF","TOP2B_MEF","PATSKI_ALLELIC_ATAC_B6","PATSKI_ALLELIC_ATAC_SPRET","PATSKI_ALLELIC_CTCF_B6","PATSKI_ALLELIC_CTCF_SPRET","P7GONIA","GENE","LNC_RNA","KAC","N_MT","N_ME")
if (!is.na(fig_dir)) {
png(file.path(fig_dir, "figS13.png"), 8000, 6000, res=200)
do.call("grid.arrange", c(plot_cell_pca(spret_cell_pca, spret_cell_df_with_cor, cols=spret_sub_cols), ncol=8))
dev.off()
}
## quartz_off_screen
## 2
Figure too big so not embedded in html, see png.
cast_cell_pca <- prcomp((cast_cell_df_with_cor[-(1:2)]), scale.=T)
cast_sub_cols <- c("TYPE", "chr3_bp","chr1_upd","QUANTILE_25_75_FLAG","CTCF","RNA_POL2","H3K27AC","H3K36ME3","DMRT6","H3K27ME3","PS_H3K27AC","PS_COHESIN","PS_SMC1B_LAP","CTCFL","END_SEQ_B6_ETO","RAD21_B6","PATSKI_ALLELIC_ATAC_B6","PATSKI_ALLELIC_CTCF_B6","CNV_CAST_LOSS","KAC")
if (!is.na(fig_dir)) {
png(file.path(fig_dir, "figS14.png"), 6000, 4000, res=200)
do.call("grid.arrange", c(plot_cell_pca(cast_cell_pca, cast_cell_df_with_cor, "PC1", "PC3", cols=cast_sub_cols), ncol=5))
dev.off()
}
## quartz_off_screen
## 2
p1 <- plot_cell_pca_simp(spret_cell_pca, spret_cell_df_with_cor, "PC1", "PC2")
p2 <- plot_cell_pca_simp(cast_cell_pca, cast_cell_df_with_cor, "PC1", "PC3")
p1
p2
if (!is.na(fig_dir)) {
pdf(file.path(fig_dir, "old_fig6B.pdf"), 4, 3)
print(p1)
print(p2)
dev.off()
}
md_spret_data <- bind_rows(hb_spret_df_with_cor %>% add_column(RESPONSE="Y", TYPE="hp", .before=1),
m2_spret_df_with_cor %>% add_column(RESPONSE="Y", TYPE="m2", .before=1),
hb_spret_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="hp", .before=1),
m2_spret_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="m2", .before=1)) %>%
select(-c(CELL_ID, CHROM, START, END, SCORE, STRAND, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75, NOTE, SEGREGATION))
table(md_spret_data$RESPONSE)
##
## N Y
## 23785 23785
if (!is.na(fig_dir)) saveRDS(md_spret_data, file.path(fig_dir, "180820_md_spret_data.rds"))
spret_bma_coef <- coef(spret_window_bas)
spret_bma_top_predictors <- setdiff(spret_bma_coef$namesx[spret_bma_coef$probne0 > 0.5], "Intercept")
use_spret_cols <- c('RESPONSE', 'CENTROMERIC', 'TELOMERIC', 'QUANTILE_25_75_FLAG', 'QUANTILE_75_100_FLAG', 'GC', 'SPO11_BOTH', 'CNV_SPRET_GAIN', 'CNV_SPRET_LOSS', 'THREE_PRIME_UTR', 'GENE', 'PSEUDOGENIC_TRANSCRIPT', 'RMSK_DNA', 'RMSK_LINE', 'RMSK_LTR', 'RMSK_LC', 'RMSK_SINE', 'RMSK_SATE', 'SEQ_DIV_SPRET', 'H3K36ME3', 'H3K4ME1', 'DMRT6', 'H3K27ME3', 'CTCFL', 'MESC_REPLI_SEQ', 'TOP2B_MEF', 'PATSKI_ALLELIC_CTCF_B6', 'PATSKI_ALLELIC_CTCF_SPRET', 'P7GONIA')
assertthat::assert_that(setequal(c('RESPONSE', spret_bma_top_predictors), use_spret_cols))
## [1] TRUE
md_spret_sub_data <- md_spret_data[use_spret_cols]
if (!is.na(fig_dir)) saveRDS(md_spret_sub_data, file.path(fig_dir, "180820_md_spret_sub_data.rds"))
md_cast_data <- bind_rows(hb_cast_df_with_cor %>% add_column(RESPONSE="Y", TYPE="hp", .before=1),
m2_cast_df_with_cor %>% add_column(RESPONSE="Y", TYPE="m2", .before=1),
hb_cast_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="hp", .before=1),
m2_cast_ctrl_with_cor %>% add_column(RESPONSE="N", TYPE="m2", .before=1)) %>%
select(-c(CELL_ID, CHROM, START, END, SCORE, STRAND, DISTANCE, BP_MID, CHROM_SIZE, QUANTILE_25, QUANTILE_50, QUANTILE_75, NOTE, SEGREGATION)) %>%
drop_na()
table(md_cast_data$RESPONSE) # slightly different
##
## N Y
## 62715 63001
if (!is.na(fig_dir)) saveRDS(md_cast_data, file.path(fig_dir, "180820_md_cast_data.rds"))
cast_bma_coef <- coef(cast_window_bas)
cast_bma_top_predictors <- setdiff(cast_bma_coef$namesx[cast_bma_coef$probne0 > 0.5], "Intercept")
use_cast_cols <- c("RESPONSE","CENTROMERIC","TELOMERIC","QUANTILE_0_25_FLAG","QUANTILE_25_75_FLAG","GC","SSDS_F1","SSDS_B6","SSDS_CAST","CNV_CAST_GAIN","THREE_PRIME_UTR","GENE","PSEUDOGENIC_TRANSCRIPT","RMSK_DNA","RMSK_LINE","RMSK_LC","RMSK_SIMPLE_REPEAT","SEQ_DIV_CAST","H3K4ME1","H3K4ME3","DMRT6","H3K27ME3","MESC_REPLI_SEQ","TOP2A_MEF","PATSKI_ALLELIC_CTCF_B6","P7GONIA")
assertthat::assert_that(setequal(c('RESPONSE', cast_bma_top_predictors), use_cast_cols))
## [1] TRUE
md_cast_sub_data <- md_cast_data[use_cast_cols]
if (!is.na(fig_dir)) saveRDS(md_cast_sub_data, file.path(fig_dir, "180820_md_cast_sub_data.rds"))
Run these on a server.
setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)
md_data <- readRDS("180820_md_spret_data.rds")
set.seed(12345)
spret_rf_ncv <- lapply(1:5, function(x) {
train_n <- round(nrow(md_data) * 0.9)
train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
train_df <- md_data[train_idx, ]
train_df$RESPONSE <- factor(train_df$RESPONSE)
test_df <- md_data[-train_idx, ]
train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
classProbs = TRUE,
summaryFunction = twoClassSummary, allowParallel = TRUE)
model_weights <- ifelse(train_df$RESPONSE == "Y",
(1/table(train_df$RESPONSE)["Y"]) * 0.5,
(1/table(train_df$RESPONSE)["N"]) * 0.5)
md <- train(RESPONSE ~ ., data = train_df,
method = "rf",
trControl = train_control,
weights = model_weights,
metric = "ROC",
na.action = "na.omit")
pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
ret <- list("md"=md,
"pred"=pred,
"pred_df"=pred_df)
return(ret)
})
spret_rf_ncv_pred_df <- map_df(spret_rf_ncv, ~.x$pred_df)
saveRDS(spret_rf_ncv_pred_df, "180820_spret_rf_ncv_pred_df.rds")
saveRDS(spret_rf_ncv, "180820_spret_rf_ncv.rds")
setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)
md_data <- readRDS("180820_md_spret_sub_data.rds")
set.seed(12345)
spret_sub_rf_ncv <- lapply(1:5, function(x) {
train_n <- round(nrow(md_data) * 0.9)
train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
train_df <- md_data[train_idx, ]
train_df$RESPONSE <- factor(train_df$RESPONSE)
test_df <- md_data[-train_idx, ]
train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
classProbs = TRUE,
summaryFunction = twoClassSummary, allowParallel = TRUE)
model_weights <- ifelse(train_df$RESPONSE == "Y",
(1/table(train_df$RESPONSE)["Y"]) * 0.5,
(1/table(train_df$RESPONSE)["N"]) * 0.5)
md <- train(RESPONSE ~ ., data = train_df,
method = "rf",
trControl = train_control,
weights = model_weights,
metric = "ROC",
na.action = "na.omit")
pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
ret <- list("md"=md,
"pred"=pred,
"pred_df"=pred_df)
return(ret)
})
spret_sub_rf_ncv_pred_df <- map_df(spret_sub_rf_ncv, ~.x$pred_df)
saveRDS(spret_sub_rf_ncv_pred_df, "180820_spret_sub_rf_ncv_pred_df.rds")
saveRDS(spret_sub_rf_ncv, "180820_spret_sub_rf_ncv.rds")
setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)
md_data <- readRDS("180820_md_cast_data.rds")
set.seed(12345)
cast_rf_ncv <- lapply(1:5, function(x) {
train_n <- round(nrow(md_data) * 0.9)
train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
train_df <- md_data[train_idx, ]
train_df$RESPONSE <- factor(train_df$RESPONSE)
test_df <- md_data[-train_idx, ]
train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
classProbs = TRUE,
summaryFunction = twoClassSummary, allowParallel = TRUE)
model_weights <- ifelse(train_df$RESPONSE == "Y",
(1/table(train_df$RESPONSE)["Y"]) * 0.5,
(1/table(train_df$RESPONSE)["N"]) * 0.5)
md <- train(RESPONSE ~ ., data = train_df,
method = "rf",
trControl = train_control,
weights = model_weights,
metric = "ROC",
na.action = "na.omit")
pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
ret <- list("md"=md,
"pred"=pred,
"pred_df"=pred_df)
return(ret)
})
cast_rf_ncv_pred_df <- map_df(cast_rf_ncv, ~.x$pred_df)
saveRDS(cast_rf_ncv_pred_df, "180820_cast_rf_ncv_pred_df.rds")
saveRDS(cast_rf_ncv, "180820_cast_rf_ncv.rds")
setwd("/net/shendure/vol8/projects/fly_recomb/nobackup/alignment/crossover_summaries/features/sci-lianti-files/features/rf")
library(tidyverse)
library(caret)
library(parallel)
library(doParallel)
n_cores <- detectCores()
cluster <- makeCluster(min(n_cores - 1, 8)) # convention to leave 1 core for OS
registerDoParallel(cluster)
md_data <- readRDS("180820_md_cast_sub_data.rds")
set.seed(12345)
cast_sub_rf_ncv <- lapply(1:5, function(x) {
train_n <- round(nrow(md_data) * 0.9)
train_idx <- sample(1:nrow(md_data), size=train_n, replace=FALSE)
train_df <- md_data[train_idx, ]
train_df$RESPONSE <- factor(train_df$RESPONSE)
test_df <- md_data[-train_idx, ]
train_control <- trainControl(method = "cv", number = 5, returnResamp = "all",
classProbs = TRUE,
summaryFunction = twoClassSummary, allowParallel = TRUE)
model_weights <- ifelse(train_df$RESPONSE == "Y",
(1/table(train_df$RESPONSE)["Y"]) * 0.5,
(1/table(train_df$RESPONSE)["N"]) * 0.5)
md <- train(RESPONSE ~ ., data = train_df,
method = "rf",
trControl = train_control,
weights = model_weights,
metric = "ROC",
na.action = "na.omit")
pred <- predict(md, newdata=test_df, type = "prob", na.action="na.omit")
pred_df <- tibble(truth=test_df[complete.cases(test_df), ]$RESPONSE, pred=pred$Y)
ret <- list("md"=md,
"pred"=pred,
"pred_df"=pred_df)
return(ret)
})
cast_sub_rf_ncv_pred_df <- map_df(cast_sub_rf_ncv, ~.x$pred_df)
saveRDS(cast_sub_rf_ncv_pred_df, "180820_cast_sub_rf_ncv_pred_df.rds")
saveRDS(cast_sub_rf_ncv, "180820_cast_sub_rf_ncv.rds")
spret_rf_ncv_pred_df <- readRDS(slfile("180820_spret_rf_ncv_pred_df.rds"))
spret_sub_rf_ncv_pred_df <- readRDS(slfile("180820_spret_sub_rf_ncv_pred_df.rds"))
cast_rf_ncv_pred_df <- readRDS(slfile("180820_cast_rf_ncv_pred_df.rds"))
cast_sub_rf_ncv_pred_df <- readRDS(slfile("180820_cast_sub_rf_ncv_pred_df.rds"))
spret_rf_roc <- roc(truth ~ pred, spret_rf_ncv_pred_df)
spret_sub_rf_roc <- roc(truth ~ pred, spret_sub_rf_ncv_pred_df)
cast_rf_roc <- roc(truth ~ pred, cast_rf_ncv_pred_df)
cast_sub_rf_roc <- roc(truth ~ pred, cast_sub_rf_ncv_pred_df)
spret_rf_roc
##
## Call:
## roc.formula(formula = truth ~ pred, data = spret_rf_ncv_pred_df)
##
## Data: pred in 11871 controls (truth N) < 11914 cases (truth Y).
## Area under the curve: 0.7396
spret_sub_rf_roc
##
## Call:
## roc.formula(formula = truth ~ pred, data = spret_sub_rf_ncv_pred_df)
##
## Data: pred in 11918 controls (truth N) < 11867 cases (truth Y).
## Area under the curve: 0.7209
cast_rf_roc
##
## Call:
## roc.formula(formula = truth ~ pred, data = cast_rf_ncv_pred_df)
##
## Data: pred in 31549 controls (truth N) < 31311 cases (truth Y).
## Area under the curve: 0.8471
cast_sub_rf_roc
##
## Call:
## roc.formula(formula = truth ~ pred, data = cast_sub_rf_ncv_pred_df)
##
## Data: pred in 31290 controls (truth N) < 31570 cases (truth Y).
## Area under the curve: 0.8506
ncv_roc_df <- bind_rows(
tibble(Sensitivity=spret_rf_roc$sensitivities, Specificity=spret_rf_roc$specificities) %>%
mutate(Strain="B6/Spret", Predictors="All features"),
tibble(Sensitivity=spret_sub_rf_roc$sensitivities, Specificity=spret_sub_rf_roc$specificities) %>%
mutate(Strain="B6/Spret", Predictors="Top BMA features"),
tibble(Sensitivity=cast_rf_roc$sensitivities, Specificity=cast_rf_roc$specificities) %>%
mutate(Strain="B6/Cast", Predictors="All features"),
tibble(Sensitivity=cast_sub_rf_roc$sensitivities, Specificity=cast_sub_rf_roc$specificities) %>%
mutate(Strain="B6/Cast", Predictors="Top BMA features")
)
p1 <- ggplot(ncv_roc_df %>% filter(Strain == "B6/Spret"), aes(Specificity, Sensitivity)) +
geom_segment(x=0, y=1, xend=-1, yend=0, color="grey50", size=0.3) +
geom_line(color="orangered") +
facet_wrap(~Predictors, nrow=1) +
xlim(1, 0) +
ylim(0, 1) +
theme_classic()
p2 <- ggplot(ncv_roc_df %>% filter(Strain == "B6/Cast"), aes(Specificity, Sensitivity)) +
geom_segment(x=0, y=1, xend=-1, yend=0, color="grey50", size=0.3) +
geom_line(color="orangered") +
facet_wrap(~Predictors, nrow=1) +
xlim(1, 0) +
ylim(0, 1) +
theme_classic()
p1
p2
if (!is.na(fig_dir)) {
ggsave("fig6E.pdf", p1, "pdf", fig_dir, width=6, height=2.3)
ggsave("fig6F.pdf", p2, "pdf", fig_dir, width=6, height=2.3)
}